Add support for the Gemini protocol

This commit is contained in:
pjht 2021-06-19 21:42:11 -05:00
parent 6ad0a52f82
commit 417ac7f654
3 changed files with 104 additions and 48 deletions

View File

@ -77,6 +77,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
- Multi-threaded full DB refresh, manual encryption support
- Shell completion scripts, man page with examples
- Privacy-aware (no unconfirmed user data collection)
- Supports the Gemini protocol
### Installation
@ -88,6 +89,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
| HTTPS | certifi, urllib3 |
| Encryption | cryptography |
| HTML | beautifulsoup4, html5lib |
| Gemini | agunua |
To copy URL to clipboard `buku` uses `xsel` (or `xclip`) on Linux, `pbcopy` (default installed) on OS X, `clip` (default installed) on Windows, `termux-clipboard` on Termux (terminal emulation for Android), `wl-copy` on Wayland. If X11 is missing, GNU Screen or tmux copy-paste buffers are recognized.

149
buku
View File

@ -49,6 +49,7 @@ import urllib3
from urllib3.exceptions import LocationParseError
from urllib3.util import parse_url, make_headers, Retry
from bs4 import BeautifulSoup
import Agunua
# note catch ModuleNotFoundError instead Exception
# when python3.5 not supported
try:
@ -3406,7 +3407,7 @@ def is_bad_url(url):
def is_nongeneric_url(url):
"""Returns True for URLs which are non-http and non-generic.
"""Returns True for URLs which are non-http/gemini and non-generic.
Parameters
----------
@ -3552,7 +3553,7 @@ def parse_decoded_page(page):
return (title, desc, keys)
def get_data_from_page(resp):
def get_data_from_http_page(resp):
"""Detect HTTP response encoding and invoke parser with decoded data.
Parameters
@ -3569,7 +3570,7 @@ def get_data_from_page(resp):
try:
soup = BeautifulSoup(resp.data, 'html.parser')
except Exception as e:
LOGERR('get_data_from_page(): %s', e)
LOGERR('get_data_from_http_page(): %s', e)
try:
charset = None
@ -3598,6 +3599,25 @@ def get_data_from_page(resp):
LOGERR(e)
return (None, None, None)
def get_title_from_gemini_page(resp):
    """Return the first heading of a Gemini (gemtext) page as its title.

    Parameters
    ----------
    resp : Agunua.GeminiUri
        Response from the Gemini server; the decoded page body is
        expected in ``resp.payload``.

    Returns
    -------
    str or None
        Text of the first level-1, level-2 or level-3 heading line,
        or None if the page contains no heading.
    """
    for line in resp.payload.split('\n'):
        # Gemtext defines exactly three heading levels: '# ', '## ', '### '.
        # The original check missed level-3 headings, so pages whose first
        # heading is '### ...' produced no title.
        if line.startswith(('# ', '## ', '### ')):
            # Drop the heading marker, keep the rest of the line verbatim.
            return line.split(' ', 1)[1]
    return None
def gen_headers():
"""Generate headers for network connection."""
@ -3669,6 +3689,22 @@ def network_handler(
(title, description, tags, recognized mime, bad url).
"""
GEMINI_CODE_READABLE = {
'30': 'Too Many Redirects',
'31': 'Too Many Redirects',
'40': 'Temporary Failure',
'41': 'Server Unavailable',
'42': 'CGI Error',
'43': 'Proxy Error',
'44': 'Slow Down',
'50': 'Permanent Failure',
'51': 'Not Found',
'52': 'Gone',
'53': 'Proxy Request Refused',
'59': 'Bad Request',
'60': 'Client Certificate Required'
}
page_title = None
page_desc = None
page_keys = None
@ -3677,55 +3713,72 @@ def network_handler(
if is_nongeneric_url(url) or is_bad_url(url):
return (None, None, None, 0, 1)
if is_ignored_mime(url) or http_head:
method = 'HEAD'
else:
method = 'GET'
if not MYHEADERS:
gen_headers()
try:
manager = get_PoolManager()
while True:
resp = manager.request(method, url, retries=Retry(redirect=10))
if resp.status == 200:
if method == 'GET':
page_title, page_desc, page_keys = get_data_from_page(resp)
elif resp.status == 403 and url.endswith('/'):
# HTTP response Forbidden
# Handle URLs in the form of https://www.domain.com/
# which fail when trying to fetch resource '/'
# retry without trailing '/'
LOGDBG('Received status 403: retrying...')
# Remove trailing /
url = url[:-1]
resp.close()
continue
if parse_url(url).scheme == 'gemini':
# Required to prevent a too many redirects error from being thrown
if url[-1] != '/':
url = url + '/'
u = Agunua.GeminiUri(url, insecure=True, get_content=True, follow_redirect=True,
redirect_depth=5, maxlines=None, maxsize=None)
if u.network_success:
if u.status_code[0] == '2':
page_title = get_title_from_gemini_page(u)
elif u.status_code[0] == '1':
pass # Input responses have no default title
else:
LOGERR('[%s] %s', resp.status, resp.reason)
LOGERR('[%s] %s', u.status_code, GEMINI_CODE_READABLE[u.status_code])
else:
LOGERR('network_handler(): %s', u.error)
sys.exit(1)
else:
if is_ignored_mime(url) or http_head:
method = 'HEAD'
else:
method = 'GET'
if resp:
resp.close()
if not MYHEADERS:
gen_headers()
break
except Exception as e:
LOGERR('network_handler(): %s', e)
exception = True
finally:
if manager:
manager.clear()
if exception:
return (None, None, None, 0, 0)
if method == 'HEAD':
return ('', '', '', 1, 0)
if page_title is None:
return ('', page_desc, page_keys, 0, 0)
try:
manager = get_PoolManager()
return (page_title, page_desc, page_keys, 0, 0)
while True:
resp = manager.request(method, url, retries=Retry(redirect=10))
if resp.status == 200:
if method == 'GET':
page_title, page_desc, page_keys = get_data_from_http_page(resp)
elif resp.status == 403 and url.endswith('/'):
# HTTP response Forbidden
# Handle URLs in the form of https://www.domain.com/
# which fail when trying to fetch resource '/'
# retry without trailing '/'
LOGDBG('Received status 403: retrying...')
# Remove trailing /
url = url[:-1]
resp.close()
continue
else:
LOGERR('[%s] %s', resp.status, resp.reason)
if resp:
resp.close()
break
except Exception as e:
LOGERR('network_handler(): %s', e)
exception = True
finally:
if manager:
manager.clear()
if exception:
return (None, None, None, 0, 0)
if method == 'HEAD':
return ('', '', '', 1, 0)
if page_title is None:
return ('', page_desc, page_keys, 0, 0)
return (page_title, page_desc, page_keys, 0, 0)
def parse_tags(keywords=[]):

View File

@ -77,6 +77,7 @@ setup(
'cryptography>=1.2.3',
'urllib3>=1.23',
'html5lib>=1.0.1',
'agunua>=1.4'
],
packages=find_packages(exclude=['tests']),
include_package_data=True,