Handle bad URLs, fetch only HEAD for known mimes
- Add API to check URL validity
- Add API to detect URLs of some non-page mimes, which have no title
- network_handler() returns a more meaningful tuple
- Handle interrupt during multiple indices and/or range update
- With urllib3, URLs without a preceding http(s):// can be handled
parent c40cd302bf
commit ac645e8140
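In short, network_handler() now returns a (title, mime, bad) tuple instead of a bare title string. A minimal sketch of a caller under the new contract (the URL is illustrative):

    title, mime, bad = network_handler('example.com/paper.pdf')
    if bad:            # malformed URL, no fetch attempted
        print('Malformed URL')
    elif mime:         # known non-page mime, only a HEAD fetch was made
        print('Skipped mime')
    elif title == '':
        print('Title: []')
    else:
        print('Title: [%s]' % title)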
@@ -216,7 +216,6 @@ Shell completion scripts for Bash, Fish and Zsh can be found in respective subdi
 - **$XDG_DATA_HOME/buku/bookmarks.db**, if XDG_DATA_HOME is defined (first preference) or
 - **$HOME/.local/share/buku/bookmarks.db**, if HOME is defined (second preference) or
 - the **current directory**.
-- It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading `http://` or `https://` token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
 - If the URL contains characters like `;`, `&` or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes (`'`/`"`).
 - URLs are unique in DB. The same URL cannot be added twice.
 - Bookmarks with immutable titles are listed with bold `(L)` after the URL.
@@ -352,6 +351,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 10. **Update** or refresh **full DB** with page titles from the web:
 
         $ buku -u
+        $ buku -u --tacit (show only failures and exceptions)
 
     This operation does not modify the indexes, URLs, tags or comments. Only title is refreshed if fetched title is non-empty.
 11. **Delete** bookmark at index 15012014:
 
buku.1 (3 changes)
@@ -26,8 +26,6 @@ The SQLite3 database file is stored in:
 - \fI$HOME/.local/share/buku/bookmarks.db\fR, if HOME is defined (second preference) or
 - the \fIcurrent directory\fR.
 .PP
-It's advisable to copy URLs directly from the browser address bar, i.e., along with the leading 'http://' or 'https://' token. buku looks up title data (found within <title></title> tags of HTML) from the web ONLY for fully-formed HTTP(S) URLs.
-.PP
 If the URL contains characters like ';', '&' or brackets they may be interpreted specially by the shell. To avoid it, add the URL within single or double quotes ('/").
 .PP
 URLs are unique in DB. The same URL cannot be added twice.
@@ -291,6 +289,7 @@ Applies to --title and --tag too. URL cannot be deleted without deleting the boo
 .EX
 .IP
 .B buku -u
+.B buku -u --tacit (show only failures and exceptions)
 .EE
 .PP
 .IP "" 4
buku.py (133 changes)
@@ -25,12 +25,13 @@ import argparse
 import webbrowser
 import html.parser as HTMLParser
 import urllib3
-from urllib.parse import unquote
+from urllib.parse import urlparse, unquote
 import signal
 import json
 import logging
 import inspect
 import atexit
+
 try:
     import readline
     readline
@@ -50,7 +51,7 @@ tagsearch = False  # Search bookmarks by tag
 title_data = None  # Title fetched from a webpage
 interrupted = False  # Received SIGINT
 DELIM = ','  # Delimiter used to store tags in DB
-SKIP_MIMES = {'.pdf', '.txt'}  # Skip connecting to web for these mimes
+SKIP_MIMES = {'.pdf', '.txt'}
 http_handler = None  # urllib3 PoolManager handler
 
 # Crypto globals
@@ -445,9 +446,14 @@ class BukuDb:
         if title_in is not None:
             meta = title_in
         else:
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m\n')
+            meta, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                logger.debug('mime recognized, only HEAD fetch attempted\n')
+            elif meta == '':
+                print('\x1b[91mTitle: []\x1b[0m\n')
+            else:
                 logger.debug('Title: [%s]', meta)
 
         # Process tags
@@ -627,23 +633,29 @@ class BukuDb:
         #    if URL is passed, update the title from web using the URL
         # 4. if no other argument (url, tag, comment, immutable) passed,
         #    update title from web using DB URL (if title is mutable)
-        meta = None
+        title_to_insert = None
         if title_in is not None:
-            meta = title_in
+            title_to_insert = title_in
         elif url != '':
-            meta = network_handler(url)
-            if meta == '':
-                print('\x1B[91mTitle: []\x1B[0m')
-            logger.debug('Title: [%s]', meta)
+            title_to_insert, mime, bad = network_handler(url)
+            if bad:
+                print('\x1b[91mMalformed URL\x1b[0m\n')
+            elif mime:
+                print('\x1b[91mSkipped mime\x1b[0m\n')
+            elif title_to_insert == '':
+                print('\x1b[91mTitle: []\x1b[0m')
+            else:
+                logger.debug('Title: [%s]', title_to_insert)
         elif not to_update and not (append_tag or delete_tag):
             ret = self.refreshdb(index)
             if ret and index and self.chatty:
+                pass
                 self.print_bm(index)
             return ret
 
-        if meta is not None:
+        if title_to_insert is not None:
             query = '%s metadata = ?,' % query
-            arguments += (meta,)
+            arguments += (title_to_insert,)
             to_update = True
 
         if not to_update:  # Nothing to update
@@ -700,17 +712,15 @@ class BukuDb:
 
         query = 'UPDATE bookmarks SET metadata = ? WHERE id = ?'
         for row in resultset:
-            title = network_handler(row[1])
-            if title == '':
-                skip = False
-                for mime in SKIP_MIMES:
-                    if row[1].lower().endswith(mime):
-                        skip = True
-                        break
-                if skip:
-                    print('\x1b[1mIndex %d: skipped mime\x1B[0m\n' % row[0])
-                else:
-                    print('\x1b[1mIndex %d: no title\x1B[0m\n' % row[0])
+            title, mime, bad = network_handler(row[1])
+            if bad:
+                print('\x1b[1mIndex %d: malformed URL\x1b[0m\n' % row[0])
+                continue
+            elif mime:
+                print('\x1b[1mIndex %d: skipped mime\x1b[0m\n' % row[0])
+                continue
+            elif title == '':
+                print('\x1b[1mIndex %d: no title\x1b[0m\n' % row[0])
                 continue
 
             self.cur.execute(query, (title, row[0],))
@@ -1306,8 +1316,57 @@ class BukuDb:
 
 # Generic functions
 
+def is_bad_url(url):
+    '''Check if URL is malformed
+    This API is not bulletproof but works in most cases.
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    # Get the netloc token
+    netloc = urlparse(url).netloc
+    if not netloc:
+        # Try to prepend '//' and get netloc
+        netloc = urlparse('//' + url).netloc
+        if not netloc:
+            return True
+
+    # netloc cannot start with a '.'
+    if netloc.startswith('.'):
+        return True
+
+    # netloc should have at least one '.'
+    index = netloc.rfind('.')
+    if index < 0:
+        return True
+
+    # '.' can be followed by 3 chars at most
+    revindex = len(netloc) - 1 - index
+    if revindex > 0 and revindex < 4:
+        return False
+
+    return True
+
+
+def is_ignored_mime(url):
+    '''Check if URL links to ignored mime
+    Only a 'HEAD' request is made for these URLs
+
+    :param url: URL to scan
+    :return: True or False
+    '''
+
+    for mime in SKIP_MIMES:
+        if url.lower().endswith(mime):
+            return True
+
+    return False
+
+
 def get_page_title(resp):
     '''Invoke HTML parser and extract title from HTTP response
+    The page title is set in a global variable
 
     :param resp: HTTP(S) GET response
     '''
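To see the heuristics in action, a few illustrative inputs and the values the two helpers above return (the sample URLs are assumptions; the TLD-length check is the admitted weak spot, so a name like example.museum is rejected):

    is_bad_url('https://example.com')    # False: netloc found directly
    is_bad_url('example.com')            # False: netloc recovered by prepending '//'
    is_bad_url('.example.com')           # True: netloc starts with '.'
    is_bad_url('example')                # True: no '.' in netloc
    is_bad_url('example.museum')         # True: more than 3 chars after the last '.'

    is_ignored_mime('http://example.com/a.PDF')   # True: case-insensitive suffix match
    is_ignored_mime('http://example.com/page')    # False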
@@ -1327,20 +1386,20 @@ def network_handler(url):
     '''Handle server connection and redirections
 
     :param url: URL to fetch
-    :return: page title, or empty string, if not found
+    :return: (title, recognized mime, bad url) tuple
     '''
 
     global title_data, http_handler
 
     title_data = None
     resp = None
+    method = 'GET'
 
-    if not (url.startswith('http://') or url.startswith('https://')):
-        return ''
+    if is_bad_url(url):
+        return ('', 0, 1)
 
-    for mime in SKIP_MIMES:
-        if url.lower().endswith(mime):
-            return ''
+    if is_ignored_mime(url):
+        method = 'HEAD'
 
     if not http_handler:
         http_handler = urllib3.PoolManager()
@@ -1348,10 +1407,11 @@ def network_handler(url):
     try:
         while True:
             resp = http_handler.request(
-                'GET', url, timeout=40,
+                method, url, timeout=40,
                 headers={'Accept-Encoding': 'gzip,deflate',
                          'DNT': '1'}
             )
+
             if resp.status == 200:
                 get_page_title(resp)
                 break
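The point of the HEAD method: the link is still resolved over the network (status and redirections included), but no body is downloaded for mimes that cannot yield a title anyway. A standalone sketch of the equivalent urllib3 call (the URL is illustrative):

    import urllib3

    http = urllib3.PoolManager()
    resp = http.request('HEAD', 'http://example.com/doc.pdf', timeout=40,
                        headers={'Accept-Encoding': 'gzip,deflate', 'DNT': '1'})
    print(resp.status)                        # e.g. 200
    print(resp.headers.get('Content-Type'))   # e.g. 'application/pdf'
    resp.release_conn()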
@@ -1378,9 +1438,11 @@ def network_handler(url):
     finally:
         if resp:
             resp.release_conn()
+    if method == 'HEAD':
+        return ('', 1, 0)
     if title_data is None:
-        return ''
-    return title_data.strip().replace('\n', '')
+        return ('', 0, 0)
+    return (title_data.strip().replace('\n', ''), 0, 0)
 
 
 def parse_tags(keywords=None):
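Taken together, the exits of network_handler() now map as follows (1 and 0 stand in for True and False):

    ('', 0, 1)     # bad URL, no fetch attempted
    ('', 1, 0)     # recognized mime, HEAD-only fetch, no title by design
    ('', 0, 0)     # page fetched, but no title found
    (title, 0, 0)  # page fetched, title extracted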
@@ -2014,6 +2076,11 @@ def main():
             for _id in range(lower, upper + 1):
                 bdb.update_bm(_id, url_in, title_in, tags,
                               description, append, delete)
+                if interrupted:
+                    break
+
+            if interrupted:
+                break
 
         # Search operations
         search_results = None
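The new checks rely on the global interrupted flag declared near the top of buku.py. The SIGINT handler that sets it is outside this diff; a minimal sketch of the pattern (the handler body is an assumption, not the exact buku code):

    import signal

    interrupted = False  # module-level flag, as in buku.py

    def sigint_handler(signum, frame):
        global interrupted
        interrupted = True  # loops check the flag and exit at a safe point

    signal.signal(signal.SIGINT, sigint_handler)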