Detect and break redirection loops.

Unquote text before connection check and GET request.

Signed-off-by: Arun Prakash Jana <engineerarun@gmail.com>
2016-03-25 11:47:20 +05:30 · 2016-03-25 11:47:20 +05:30 · a56fa0381d
commit a56fa0381d
parent ea02856bc1
2 changed files with 26 additions and 10 deletions
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ If you find `buku` useful, please consider donating via PayPal.
 - Delete all bookmarks from DB
 - Add a bookmark at N<sup>th</sup> index, to fill deleted bookmark indices
 - Secure parameterized SQLite3 queries to access database
- Handle multiple HTTP redirections (reports redireted URL and IP blocking)
+- Handle multiple HTTP redirections (reports redirected URL, loops, IP blocking)
 - Unicode in URL works
 - UTF-8 request and response, page character set detection
 - Works with Python 3.x
--- a/30
+++ b/30
@@ -140,6 +140,13 @@ def initdb():
 # Get page response data
 def getPageResp(url, redir=False):
    if url.find("%20") != -1:
        url = unquote(url)
        url = url.replace(" ", "%20")
    else:
        url = unquote(url)
    print("unquote: %s" % url)
    if url.find("https://") >= 0: # Secure connection
        server = url[8:]
        marker = server.find("/")
@@ -161,9 +168,9 @@ def getPageResp(url, redir=False):
    if debug:
        print("server: [%s]" % server)
-        print("URL: [%s]" % quote(unquote(url)))
+        print("URL: [%s]" % url)
-    urlconn.request("GET", quote(unquote(url)))
+    urlconn.request("GET", url)
    resp = urlconn.getresponse()
    return (resp, urlconn)
@@ -185,16 +192,25 @@ def fetchTitle(url):
                getTitleData(resp)
                break
            if resp.status in (301,302,):
-                print("\x1b[1mREDIRECTION:\x1b[21m %s" % resp.getheader('location', ''))
+                redirurl = urljoin(url, resp.getheader('location', ''))
                print("\x1b[1mREDIRECTION:\x1b[21m %s" % url)
-                url = urljoin(url, resp.getheader('location', ''))
+                if redirurl.find("sorry/IndexRedirect?") >= 0:          # graecefully handle Google blocks
                if url.find("sorry/IndexRedirect?") >= 0:          # graecefully handle Google blocks
                    print("ERROR: Connection blocked due to unusual activity.")
                    break
-                else:
+
                marker = redirurl.find("redirectUrl=")
                if marker != -1:
                    redirurl = redirurl[marker + 12:]
                # break same URL redirection loop
                if url == redirurl:
                    print("ERROR: Detected repeated reirection to same URL")
                    break
                url = redirurl
                urlconn.close()
                resp, urlconn = getPageResp(url, True)
                    continue
            else:
                print("ERROR in response:", str(resp.status), ": ", resp.reason)
                break