Handle bad URLs, fetch only HEAD for known mimes

Add API to check URL validity
Add API to detect URLs of some non-page mimes which lack a title
network_handler() now returns a more meaningful tuple
Handle interrupts during multiple-index and/or range updates
With urllib3, URLs without a preceding http(s):// can be handled
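
In rough sketch form, callers now unpack a 3-tuple from network_handler()
instead of a bare title string (names and branches are taken from the diff
below; the snippet itself is illustrative, not part of the commit):

    # Hedged sketch of the new network_handler() contract
    title, mime, bad = network_handler(url)
    if bad:              # is_bad_url() judged the URL malformed
        print('Malformed URL')
    elif mime:           # recognized mime (.pdf/.txt); only a HEAD fetch was attempted
        print('Skipped mime')
    elif title == '':    # page reachable, but no title found
        print('Title: []')
    else:
        logger.debug('Title: [%s]', title)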
Arun Prakash Jana 2016-11-08 23:02:45 +05:30
parent c40cd302bf
commit ac645e8140
No known key found for this signature in database
GPG Key ID: A75979F35C080412
3 changed files with 103 additions and 37 deletions


README.md

@@ -216,7 +216,6 @@ Shell completion scripts for Bash, Fish and Zsh can be found in respective subdi
 - **$XDG_DATA_HOME/buku/bookmarks.db**, if XDG_DATA_HOME is defined (first preference) or
 - **$HOME/.local/share/buku/bookmarks.db**, if HOME is defined (second preference) or
 - the **current directory**.
-- It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading `http://` or `https://` token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
 - If the URL contains characters like `;`, `&` or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes (`'`/`"`).
 - URLs are unique in DB. The same URL cannot be added twice.
 - Bookmarks with immutable titles are listed with bold `(L)` after the URL.
@@ -352,6 +351,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 10. **Update** or refresh **full DB** with page titles from the web:
 
     $ buku -u
+    $ buku -u --tacit (show only failures and exceptions)
 
 This operation does not modify the indexes, URLs, tags or comments. Only title is refreshed if fetched title is non-empty.
 11. **Delete** bookmark at index 15012014:

buku.1

@@ -26,8 +26,6 @@ The SQLite3 database file is stored in:
 - \fI$HOME/.local/share/buku/bookmarks.db\fR, if HOME is defined (second preference) or
 - the \fIcurrent directory\fR.
 .PP
-It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading 'http://' or 'https://' token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
-.PP
 If the URL contains characters like ';', '&' or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes ('/").
 .PP
 URLs are unique in DB. The same URL cannot be added twice.
@@ -291,6 +289,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 .EX
 .IP
 .B buku -u
+.B buku -u --tacit (show only failures and exceptions)
 .EE
 .PP
 .IP "" 4

buku.py

@@ -25,12 +25,13 @@ import argparse
 import webbrowser
 import html.parser as HTMLParser
 import urllib3
-from urllib.parse import unquote
+from urllib.parse import urlparse, unquote
 import signal
 import json
 import logging
 import inspect
 import atexit
+
 try:
     import readline
     readline
@@ -50,7 +51,7 @@ tagsearch = False  # Search bookmarks by tag
 title_data = None  # Title fetched from a webpage
 interrupted = False  # Received SIGINT
 DELIM = ','  # Delimiter used to store tags in DB
-SKIP_MIMES = {'.pdf', '.txt'}  # Skip connecting to web for these mimes
+SKIP_MIMES = {'.pdf', '.txt'}
 http_handler = None  # urllib3 PoolManager handler
 
 # Crypto globals
@@ -445,10 +446,15 @@ class BukuDb:
         if title_in is not None:
             meta = title_in
         else:
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m\n')
-            logger.debug('Title: [%s]', meta)
+            meta, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                logger.debug('mime recognized, only HEAD fetch attempted\n')
+            elif meta == '':
+                print('\x1b[91mTitle: []\x1b[0m\n')
+            else:
+                logger.debug('Title: [%s]', meta)
 
         # Process tags
         if tags_in is None:
@@ -627,23 +633,29 @@ class BukuDb:
         # if URL is passed, update the title from web using the URL
         # 4. if no other argument (url, tag, comment, immutable) passed,
         #    update title from web using DB URL (if title is mutable)
-        meta = None
+        title_to_insert = None
         if title_in is not None:
-            meta = title_in
+            title_to_insert = title_in
         elif url != '':
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m')
-            logger.debug('Title: [%s]', meta)
+            title_to_insert, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                print('\x1b[91mSkipped mime\x1b[0m\n')
+            elif title_to_insert == '':
+                print('\x1b[91mTitle: []\x1b[0m')
+            else:
+                logger.debug('Title: [%s]', title_to_insert)
         elif not to_update and not (append_tag or delete_tag):
             ret = self.refreshdb(index)
             if ret and index and self.chatty:
+                pass
                 self.print_bm(index)
             return ret
 
-        if meta is not None:
+        if title_to_insert is not None:
             query = '%s metadata = ?,' % query
-            arguments += (meta,)
+            arguments += (title_to_insert,)
             to_update = True
 
         if not to_update:  # Nothing to update
@@ -700,17 +712,15 @@ class BukuDb:
         query = 'UPDATE bookmarks SET metadata = ? WHERE id = ?'
 
         for row in resultset:
-            title = network_handler(row[1])
-            if title == '':
-                skip = False
-                for mime in SKIP_MIMES:
-                    if row[1].lower().endswith(mime):
-                        skip = True
-                        break
-                if skip:
-                    print('\x1b[1mIndex %d: skipped mime\x1B[0m\n' % row[0])
-                else:
-                    print('\x1b[1mIndex %d: no title\x1B[0m\n' % row[0])
+            title, mime, bad = network_handler(row[1])
+            if bad:
+                print('\x1b[1mIndex %d: malformed URL\x1b[0m\n' % row[0])
+                continue
+            elif mime:
+                print('\x1b[1mIndex %d: skipped mime\x1b[0m\n' % row[0])
+                continue
+            elif title == '':
+                print('\x1b[1mIndex %d: no title\x1b[0m\n' % row[0])
                 continue
 
             self.cur.execute(query, (title, row[0],))
@@ -1306,8 +1316,57 @@ class BukuDb:
 
 # Generic functions
 
+def is_bad_url(url):
+    '''Check if URL is malformed
+    This API is not bulletproof but works in most cases.
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    # Get the netloc token
+    netloc = urlparse(url).netloc
+    if not netloc:
+        # Try to prepend '//' and get netloc
+        netloc = urlparse('//' + url).netloc
+        if not netloc:
+            return True
+
+    # netloc cannot start with a '.'
+    if netloc.startswith('.'):
+        return True
+
+    # netloc should have at least one '.'
+    index = netloc.rfind('.')
+    if index < 0:
+        return True
+
+    # '.' can be followed by 3 chars at most
+    revindex = len(netloc) - 1 - index
+    if revindex > 0 and revindex < 4:
+        return False
+
+    return True
+
+
+def is_ignored_mime(url):
+    '''Check if URL links to ignored mime
+    Only a 'HEAD' request is made for these URLs
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    for mime in SKIP_MIMES:
+        if url.lower().endswith(mime):
+            return True
+
+    return False
+
+
 def get_page_title(resp):
     '''Invoke HTML parser and extract title from HTTP response
+    The page title is set in a global variable
 
     :param resp: HTTP(S) GET response
     '''
@@ -1327,20 +1386,20 @@ def network_handler(url):
     '''Handle server connection and redirections
 
     :param url: URL to fetch
-    :return: page title, or empty string, if not found
+    :return: {title, recognized mime, bad url} tuple
     '''
 
     global title_data, http_handler
 
     title_data = None
     resp = None
+    method = 'GET'
 
-    if not (url.startswith('http://') or url.startswith('https://')):
-        return ''
+    if is_bad_url(url):
+        return ('', 0, 1)
 
-    for mime in SKIP_MIMES:
-        if url.lower().endswith(mime):
-            return ''
+    if is_ignored_mime(url):
+        method = 'HEAD'
 
     if not http_handler:
         http_handler = urllib3.PoolManager()
@@ -1348,10 +1407,11 @@ def network_handler(url):
     try:
         while True:
             resp = http_handler.request(
-                'GET', url, timeout=40,
+                method, url, timeout=40,
                 headers={'Accept-Encoding': 'gzip,deflate',
                          'DNT': '1'}
             )
 
             if resp.status == 200:
                 get_page_title(resp)
                 break
@@ -1378,9 +1438,11 @@ def network_handler(url):
     finally:
         if resp:
             resp.release_conn()
 
+    if method == 'HEAD':
+        return ('', 1, 0)
+
     if title_data is None:
-        return ''
+        return ('', 0, 0)
 
-    return title_data.strip().replace('\n', '')
+    return (title_data.strip().replace('\n', ''), 0, 0)
 
 
 def parse_tags(keywords=None):
@@ -2014,6 +2076,11 @@ def main():
                     for _id in range(lower, upper + 1):
                         bdb.update_bm(_id, url_in, title_in, tags,
                                       description, append, delete)
+                        if interrupted:
+                            break
+
+                if interrupted:
+                    break
 
     # Search operations
     search_results = None
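
For reference, a hedged sketch of how the new validators behave on sample
inputs (expected results derived from the is_bad_url()/is_ignored_mime()
logic above; the URLs are made-up examples):

    # Illustrative checks, not part of the commit
    assert is_bad_url('ddg.gg') is False          # scheme-less URLs now work via urllib3
    assert is_bad_url('no-dot-netloc') is True    # netloc must contain a '.'
    assert is_bad_url('.example.com') is True     # netloc cannot start with '.'
    assert is_ignored_mime('http://example.com/report.pdf') is True   # HEAD-only fetch
    assert is_ignored_mime('http://example.com/index.html') is False  # full GET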