Handle bad URLs, fetch only HEAD for known mimes

Add API to check URL validity
Add API to detect URLs pointing to some non-page mimes that lack a title
network_handler() now returns a more meaningful tuple
Handle interrupt during update of multiple indices and/or ranges
With urllib3, URLs without a preceding http(s):// prefix can be handled
Arun Prakash Jana 2016-11-08 23:02:45 +05:30
parent c40cd302bf
commit ac645e8140
No known key found for this signature in database
GPG Key ID: A75979F35C080412
3 changed files with 103 additions and 37 deletions
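In short, callers now unpack a 3-tuple from network_handler() instead of a bare title string. A minimal consumer-side sketch, mirroring the branches in the diffs below (the URL is hypothetical):

    title, mime, bad = network_handler('https://example.com/page')
    if bad:
        print('Malformed URL')    # rejected before any connection attempt
    elif mime:
        pass                      # known non-page mime; only a HEAD request was made
    elif title == '':
        print('Title: []')        # page fetched, but no title found
    else:
        print('Title:', title)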

README.md

@@ -216,7 +216,6 @@ Shell completion scripts for Bash, Fish and Zsh can be found in respective subdi
 - **$XDG_DATA_HOME/buku/bookmarks.db**, if XDG_DATA_HOME is defined (first preference) or
 - **$HOME/.local/share/buku/bookmarks.db**, if HOME is defined (second preference) or
 - the **current directory**.
-- It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading `http://` or `https://` token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
 - If the URL contains characters like `;`, `&` or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes (`'`/`"`).
 - URLs are unique in DB. The same URL cannot be added twice.
 - Bookmarks with immutable titles are listed with bold `(L)` after the URL.
@@ -352,6 +351,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 10. **Update** or refresh **full DB** with page titles from the web:
 
         $ buku -u
+        $ buku -u --tacit (show only failures and exceptions)
 
     This operation does not modify the indexes, URLs, tags or comments. Only title is refreshed if fetched title is non-empty.
 11. **Delete** bookmark at index 15012014:

buku.1

@@ -26,8 +26,6 @@ The SQLite3 database file is stored in:
 - \fI$HOME/.local/share/buku/bookmarks.db\fR, if HOME is defined (second preference) or
 - the \fIcurrent directory\fR.
 .PP
-It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading 'http://' or 'https://' token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
-.PP
 If the URL contains characters like ';', '&' or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes ('/").
 .PP
 URLs are unique in DB. The same URL cannot be added twice.
@@ -291,6 +289,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 .EX
 .IP
 .B buku -u
+.B buku -u --tacit (show only failures and exceptions)
 .EE
 .PP
 .IP "" 4

buku.py

@@ -25,12 +25,13 @@ import argparse
 import webbrowser
 import html.parser as HTMLParser
 import urllib3
-from urllib.parse import unquote
+from urllib.parse import urlparse, unquote
 import signal
 import json
 import logging
 import inspect
 import atexit
 try:
     import readline
     readline
@@ -50,7 +51,7 @@ tagsearch = False  # Search bookmarks by tag
 title_data = None  # Title fetched from a webpage
 interrupted = False  # Received SIGINT
 DELIM = ','  # Delimiter used to store tags in DB
-SKIP_MIMES = {'.pdf', '.txt'}  # Skip connecting to web for these mimes
+SKIP_MIMES = {'.pdf', '.txt'}
 http_handler = None  # urllib3 PoolManager handler
 
 # Crypto globals
@@ -445,9 +446,14 @@ class BukuDb:
         if title_in is not None:
             meta = title_in
         else:
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m\n')
+            meta, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                logger.debug('mime recognized, only HEAD fetch attempted\n')
+            elif meta == '':
+                print('\x1b[91mTitle: []\x1b[0m\n')
             else:
                 logger.debug('Title: [%s]', meta)
 
         # Process tags
@@ -627,23 +633,29 @@ class BukuDb:
         #    if URL is passed, update the title from web using the URL
         # 4. if no other argument (url, tag, comment, immutable) passed,
         #    update title from web using DB URL (if title is mutable)
-        meta = None
+        title_to_insert = None
         if title_in is not None:
-            meta = title_in
+            title_to_insert = title_in
         elif url != '':
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m')
-            logger.debug('Title: [%s]', meta)
+            title_to_insert, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                print('\x1b[91mSkipped mime\x1b[0m\n')
+            elif title_to_insert == '':
+                print('\x1b[91mTitle: []\x1b[0m')
+            else:
+                logger.debug('Title: [%s]', title_to_insert)
         elif not to_update and not (append_tag or delete_tag):
             ret = self.refreshdb(index)
             if ret and index and self.chatty:
-                pass
+                self.print_bm(index)
             return ret
 
-        if meta is not None:
+        if title_to_insert is not None:
             query = '%s metadata = ?,' % query
-            arguments += (meta,)
+            arguments += (title_to_insert,)
             to_update = True
 
         if not to_update:  # Nothing to update
@@ -700,17 +712,15 @@ class BukuDb:
             query = 'UPDATE bookmarks SET metadata = ? WHERE id = ?'
 
         for row in resultset:
-            title = network_handler(row[1])
-            if title == '':
-                skip = False
-                for mime in SKIP_MIMES:
-                    if row[1].lower().endswith(mime):
-                        skip = True
-                        break
-                if skip:
-                    print('\x1b[1mIndex %d: skipped mime\x1B[0m\n' % row[0])
-                else:
-                    print('\x1b[1mIndex %d: no title\x1B[0m\n' % row[0])
-                continue
+            title, mime, bad = network_handler(row[1])
+            if bad:
+                print('\x1b[1mIndex %d: malformed URL\x1b[0m\n' % row[0])
+                continue
+            elif mime:
+                print('\x1b[1mIndex %d: skipped mime\x1b[0m\n' % row[0])
+                continue
+            elif title == '':
+                print('\x1b[1mIndex %d: no title\x1b[0m\n' % row[0])
+                continue
 
             self.cur.execute(query, (title, row[0],))
@@ -1306,8 +1316,57 @@ class BukuDb:
 
 # Generic functions
 
+def is_bad_url(url):
+    '''Check if URL is malformed
+    This API is not bulletproof but works in most cases.
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    # Get the netloc token
+    netloc = urlparse(url).netloc
+    if not netloc:
+        # Try to prepend '//' and get netloc
+        netloc = urlparse('//' + url).netloc
+        if not netloc:
+            return True
+
+    # netloc cannot start with a '.'
+    if netloc.startswith('.'):
+        return True
+
+    # netloc should have at least one '.'
+    index = netloc.rfind('.')
+    if index < 0:
+        return True
+
+    # '.' can be followed by 3 chars at most
+    revindex = len(netloc) - 1 - index
+    if revindex > 0 and revindex < 4:
+        return False
+
+    return True
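As a quick sanity check of the heuristic (hypothetical inputs; the last case shows the known limitation with TLDs longer than 3 characters):

    assert is_bad_url('no-dot-netloc')                 # no '.' in netloc
    assert is_bad_url('http://.example.com')           # netloc starts with '.'
    assert not is_bad_url('example.com')               # scheme-less URLs are now accepted
    assert not is_bad_url('https://example.org/page')  # 1-3 char TLD passes
    assert is_bad_url('https://example.photography')   # false positive: TLD > 3 chars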
+def is_ignored_mime(url):
+    '''Check if URL links to ignored mime
+    Only a 'HEAD' request is made for these URLs
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    for mime in SKIP_MIMES:
+        if url.lower().endswith(mime):
+            return True
+
+    return False
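The mime check keys off the URL suffix alone, so no connection is needed to classify (hypothetical inputs):

    assert is_ignored_mime('https://example.com/report.PDF')  # match is case-insensitive
    assert not is_ignored_mime('https://example.com/page.html')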
 def get_page_title(resp):
     '''Invoke HTML parser and extract title from HTTP response
+    The page title is set in a global variable
 
     :param resp: HTTP(S) GET response
     '''
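The parser itself is defined elsewhere in buku.py and is not part of this diff; a minimal sketch of the pattern, with a hypothetical class name and the actual implementation details elided:

    import html.parser as HTMLParser

    title_data = None  # module-level result, mirroring buku's global

    class TitleParser(HTMLParser.HTMLParser):
        '''Collect the text between <title> and </title> into title_data.'''

        def __init__(self):
            super().__init__()
            self.in_title = False

        def handle_starttag(self, tag, attrs):
            if tag == 'title':
                self.in_title = True

        def handle_endtag(self, tag):
            if tag == 'title':
                self.in_title = False

        def handle_data(self, data):
            global title_data
            if self.in_title and data.strip():
                title_data = data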
@@ -1327,20 +1386,20 @@ def network_handler(url):
     '''Handle server connection and redirections
 
     :param url: URL to fetch
-    :return: page title, or empty string, if not found
+    :return: (title, recognized mime, bad url) tuple
     '''
 
     global title_data, http_handler
     title_data = None
     resp = None
+    method = 'GET'
 
-    if not (url.startswith('http://') or url.startswith('https://')):
-        return ''
+    if is_bad_url(url):
+        return ('', 0, 1)
 
-    for mime in SKIP_MIMES:
-        if url.lower().endswith(mime):
-            return ''
+    if is_ignored_mime(url):
+        method = 'HEAD'
 
     if not http_handler:
         http_handler = urllib3.PoolManager()
@ -1348,10 +1407,11 @@ def network_handler(url):
try:
while True:
resp = http_handler.request(
'GET', url, timeout=40,
method, url, timeout=40,
headers={'Accept-Encoding': 'gzip,deflate',
'DNT': '1'}
)
if resp.status == 200:
get_page_title(resp)
break
@@ -1378,9 +1438,11 @@ def network_handler(url):
     finally:
         if resp:
             resp.release_conn()
 
+    if method == 'HEAD':
+        return ('', 1, 0)
+
     if title_data is None:
-        return ''
-    return title_data.strip().replace('\n', '')
+        return ('', 0, 0)
+    return (title_data.strip().replace('\n', ''), 0, 0)
 
 def parse_tags(keywords=None):
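Taken together, network_handler() now has three distinct outcomes; an indicative illustration (assumes network access, and that example.com serves its usual page):

    network_handler('no-dot-netloc')                # ('', 0, 1): bad URL, nothing fetched
    network_handler('http://example.com/spec.pdf')  # ('', 1, 0): known mime, HEAD only
    network_handler('http://example.com/')          # ('Example Domain', 0, 0)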
@@ -2014,6 +2076,11 @@ def main():
                 for _id in range(lower, upper + 1):
                     bdb.update_bm(_id, url_in, title_in, tags,
                                   description, append, delete)
+                    if interrupted:
+                        break
+
+            if interrupted:
+                break
 
     # Search operations
     search_results = None
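The interrupted flag polled above is raised by buku's SIGINT handler, registered elsewhere in buku.py; a minimal sketch of the pattern, with the handler body simplified:

    import signal

    interrupted = False  # Received SIGINT

    def sigint_handler(signum, frame):
        # Raise the flag; batched update loops poll it and bail out cleanly
        global interrupted
        interrupted = True

    signal.signal(signal.SIGINT, sigint_handler)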