Handle bad URLs, fetch only HEAD for known mimes

Add API to check URL validity
Add API to detect URLs of some non-page mimes which lack a title
network_handler() now returns a more meaningful tuple
Handle interrupts during multiple-index and/or range updates
With urllib3, URLs without a preceding http(s):// can be handled
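
In rough sketch form, callers now unpack a 3-tuple from network_handler()
instead of a bare title string (names and branches are taken from the diff
below; the snippet itself is illustrative, not part of the commit):

    # Hedged sketch of the new network_handler() contract
    title, mime, bad = network_handler(url)
    if bad:              # is_bad_url() judged the URL malformed
        print('Malformed URL')
    elif mime:           # recognized mime (.pdf/.txt); only a HEAD fetch was attempted
        print('Skipped mime')
    elif title == '':    # page reachable, but no title found
        print('Title: []')
    else:
        logger.debug('Title: [%s]', title)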
Arun Prakash Jana 2016-11-08 23:02:45 +05:30
parent c40cd302bf
commit ac645e8140
No known key found for this signature in database
GPG Key ID: A75979F35C080412
3 changed files with 103 additions and 37 deletions


README.md

@@ -216,7 +216,6 @@ Shell completion scripts for Bash, Fish and Zsh can be found in respective subdi
 - **$XDG_DATA_HOME/buku/bookmarks.db**, if XDG_DATA_HOME is defined (first preference) or
 - **$HOME/.local/share/buku/bookmarks.db**, if HOME is defined (second preference) or
 - the **current directory**.
-- It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading `http://` or `https://` token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
 - If the URL contains characters like `;`, `&` or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes (`'`/`"`).
 - URLs are unique in DB. The same URL cannot be added twice.
 - Bookmarks with immutable titles are listed with bold `(L)` after the URL.
@@ -352,6 +351,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 10. **Update** or refresh **full DB** with page titles from the web:
 
     $ buku -u
+    $ buku -u --tacit (show only failures and exceptions)
 
 This operation does not modify the indexes, URLs, tags or comments. Only title is refreshed if fetched title is non-empty.
 11. **Delete** bookmark at index 15012014:

buku.1

@@ -26,8 +26,6 @@ The SQLite3 database file is stored in:
 - \fI$HOME/.local/share/buku/bookmarks.db\fR, if HOME is defined (second preference) or
 - the \fIcurrent directory\fR.
 .PP
-It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading 'http://' or 'https://' token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
-.PP
 If the URL contains characters like ';', '&' or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes ('/").
 .PP
 URLs are unique in DB. The same URL cannot be added twice.
@@ -291,6 +289,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 .EX
 .IP
 .B buku -u
+.B buku -u --tacit (show only failures and exceptions)
 .EE
 .PP
 .IP "" 4

buku.py

@@ -25,12 +25,13 @@ import argparse
 import webbrowser
 import html.parser as HTMLParser
 import urllib3
-from urllib.parse import unquote
+from urllib.parse import urlparse, unquote
 import signal
 import json
 import logging
 import inspect
 import atexit
+
 try:
     import readline
     readline
@@ -50,7 +51,7 @@ tagsearch = False  # Search bookmarks by tag
 title_data = None  # Title fetched from a webpage
 interrupted = False  # Received SIGINT
 DELIM = ','  # Delimiter used to store tags in DB
-SKIP_MIMES = {'.pdf', '.txt'}  # Skip connecting to web for these mimes
+SKIP_MIMES = {'.pdf', '.txt'}
 http_handler = None  # urllib3 PoolManager handler
 
 # Crypto globals
@@ -445,10 +446,15 @@ class BukuDb:
         if title_in is not None:
             meta = title_in
         else:
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m\n')
-            logger.debug('Title: [%s]', meta)
+            meta, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                logger.debug('mime recognized, only HEAD fetch attempted\n')
+            elif meta == '':
+                print('\x1b[91mTitle: []\x1b[0m\n')
+            else:
+                logger.debug('Title: [%s]', meta)
 
         # Process tags
         if tags_in is None:
@@ -627,23 +633,29 @@ class BukuDb:
         # if URL is passed, update the title from web using the URL
         # 4. if no other argument (url, tag, comment, immutable) passed,
         #    update title from web using DB URL (if title is mutable)
-        meta = None
+        title_to_insert = None
         if title_in is not None:
-            meta = title_in
+            title_to_insert = title_in
         elif url != '':
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m')
-            logger.debug('Title: [%s]', meta)
+            title_to_insert, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                print('\x1b[91mSkipped mime\x1b[0m\n')
+            elif title_to_insert == '':
+                print('\x1b[91mTitle: []\x1b[0m')
+            else:
+                logger.debug('Title: [%s]', title_to_insert)
         elif not to_update and not (append_tag or delete_tag):
             ret = self.refreshdb(index)
             if ret and index and self.chatty:
+                pass
                 self.print_bm(index)
             return ret
 
-        if meta is not None:
+        if title_to_insert is not None:
             query = '%s metadata = ?,' % query
-            arguments += (meta,)
+            arguments += (title_to_insert,)
             to_update = True
 
         if not to_update:  # Nothing to update
@@ -700,17 +712,15 @@ class BukuDb:
         query = 'UPDATE bookmarks SET metadata = ? WHERE id = ?'
 
         for row in resultset:
-            title = network_handler(row[1])
-            if title == '':
-                skip = False
-                for mime in SKIP_MIMES:
-                    if row[1].lower().endswith(mime):
-                        skip = True
-                        break
-                if skip:
-                    print('\x1b[1mIndex %d: skipped mime\x1B[0m\n' % row[0])
-                else:
-                    print('\x1b[1mIndex %d: no title\x1B[0m\n' % row[0])
+            title, mime, bad = network_handler(row[1])
+            if bad:
+                print('\x1b[1mIndex %d: malformed URL\x1b[0m\n' % row[0])
+                continue
+            elif mime:
+                print('\x1b[1mIndex %d: skipped mime\x1b[0m\n' % row[0])
+                continue
+            elif title == '':
+                print('\x1b[1mIndex %d: no title\x1b[0m\n' % row[0])
                 continue
 
             self.cur.execute(query, (title, row[0],))
@@ -1306,8 +1316,57 @@ class BukuDb:
 
 # Generic functions
 
+def is_bad_url(url):
+    '''Check if URL is malformed
+    This API is not bulletproof but works in most cases.
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    # Get the netloc token
+    netloc = urlparse(url).netloc
+    if not netloc:
+        # Try to prepend '//' and get netloc
+        netloc = urlparse('//' + url).netloc
+        if not netloc:
+            return True
+
+    # netloc cannot start with a '.'
+    if netloc.startswith('.'):
+        return True
+
+    # netloc should have at least one '.'
+    index = netloc.rfind('.')
+    if index < 0:
+        return True
+
+    # '.' can be followed by 3 chars at most
+    revindex = len(netloc) - 1 - index
+    if revindex > 0 and revindex < 4:
+        return False
+
+    return True
+
+
+def is_ignored_mime(url):
+    '''Check if URL links to ignored mime
+    Only a 'HEAD' request is made for these URLs
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    for mime in SKIP_MIMES:
+        if url.lower().endswith(mime):
+            return True
+
+    return False
+
+
 def get_page_title(resp):
     '''Invoke HTML parser and extract title from HTTP response
+    The page title is set in a global variable
 
     :param resp: HTTP(S) GET response
     '''
@@ -1327,20 +1386,20 @@ def network_handler(url):
     '''Handle server connection and redirections
 
     :param url: URL to fetch
-    :return: page title, or empty string, if not found
+    :return: {title, recognized mime, bad url} tuple
     '''
 
     global title_data, http_handler
 
     title_data = None
     resp = None
+    method = 'GET'
 
-    if not (url.startswith('http://') or url.startswith('https://')):
-        return ''
+    if is_bad_url(url):
+        return ('', 0, 1)
 
-    for mime in SKIP_MIMES:
-        if url.lower().endswith(mime):
-            return ''
+    if is_ignored_mime(url):
+        method = 'HEAD'
 
     if not http_handler:
         http_handler = urllib3.PoolManager()
@@ -1348,10 +1407,11 @@ def network_handler(url):
     try:
         while True:
             resp = http_handler.request(
-                'GET', url, timeout=40,
+                method, url, timeout=40,
                 headers={'Accept-Encoding': 'gzip,deflate',
                          'DNT': '1'}
             )
 
             if resp.status == 200:
                 get_page_title(resp)
                 break
@@ -1378,9 +1438,11 @@ def network_handler(url):
     finally:
         if resp:
             resp.release_conn()
 
+    if method == 'HEAD':
+        return ('', 1, 0)
+
     if title_data is None:
-        return ''
+        return ('', 0, 0)
 
-    return title_data.strip().replace('\n', '')
+    return (title_data.strip().replace('\n', ''), 0, 0)
 
 
 def parse_tags(keywords=None):
@@ -2014,6 +2076,11 @@ def main():
                     for _id in range(lower, upper + 1):
                         bdb.update_bm(_id, url_in, title_in, tags,
                                       description, append, delete)
+                        if interrupted:
+                            break
+
+                if interrupted:
+                    break
 
     # Search operations
     search_results = None
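
For reference, a hedged sketch of how the new validators behave on sample
inputs (expected results derived from the is_bad_url()/is_ignored_mime()
logic above; the URLs are made-up examples):

    # Illustrative checks, not part of the commit
    assert is_bad_url('ddg.gg') is False          # scheme-less URLs now work via urllib3
    assert is_bad_url('no-dot-netloc') is True    # netloc must contain a '.'
    assert is_bad_url('.example.com') is True     # netloc cannot start with '.'
    assert is_ignored_mime('http://example.com/report.pdf') is True   # HEAD-only fetch
    assert is_ignored_mime('http://example.com/index.html') is False  # full GET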