Add support for the Gemini protocol
This commit is contained in:
parent
6ad0a52f82
commit
417ac7f654
@ -77,6 +77,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
|
||||
- Multi-threaded full DB refresh, manual encryption support
|
||||
- Shell completion scripts, man page with examples
|
||||
- Privacy-aware (no unconfirmed user data collection)
|
||||
- Supports the Gemini protocol
|
||||
|
||||
### Installation
|
||||
|
||||
@ -88,6 +89,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
|
||||
| HTTPS | certifi, urllib3 |
|
||||
| Encryption | cryptography |
|
||||
| HTML | beautifulsoup4, html5lib |
|
||||
| Gemini | agunua |
|
||||
|
||||
To copy URL to clipboard `buku` uses `xsel` (or `xclip`) on Linux, `pbcopy` (default installed) on OS X, `clip` (default installed) on Windows, `termux-clipboard` on Termux (terminal emulation for Android), `wl-copy` on Wayland. If X11 is missing, GNU Screen or tmux copy-paste buffers are recognized.
|
||||
|
||||
|
149
buku
149
buku
@ -49,6 +49,7 @@ import urllib3
|
||||
from urllib3.exceptions import LocationParseError
|
||||
from urllib3.util import parse_url, make_headers, Retry
|
||||
from bs4 import BeautifulSoup
|
||||
import Agunua
|
||||
# NOTE: catch ModuleNotFoundError instead of Exception
|
||||
# once Python 3.5 is no longer supported
|
||||
try:
|
||||
@ -3406,7 +3407,7 @@ def is_bad_url(url):
|
||||
|
||||
|
||||
def is_nongeneric_url(url):
|
||||
"""Returns True for URLs which are non-http and non-generic.
|
||||
"""Returns True for URLs which are non-http/gemini and non-generic.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@ -3552,7 +3553,7 @@ def parse_decoded_page(page):
|
||||
return (title, desc, keys)
|
||||
|
||||
|
||||
def get_data_from_page(resp):
|
||||
def get_data_from_http_page(resp):
|
||||
"""Detect HTTP response encoding and invoke parser with decoded data.
|
||||
|
||||
Parameters
|
||||
@ -3569,7 +3570,7 @@ def get_data_from_page(resp):
|
||||
try:
|
||||
soup = BeautifulSoup(resp.data, 'html.parser')
|
||||
except Exception as e:
|
||||
LOGERR('get_data_from_page(): %s', e)
|
||||
LOGERR('get_data_from_http_page(): %s', e)
|
||||
|
||||
try:
|
||||
charset = None
|
||||
@ -3598,6 +3599,25 @@ def get_data_from_page(resp):
|
||||
LOGERR(e)
|
||||
return (None, None, None)
|
||||
|
||||
def get_title_from_gemini_page(resp):
    """Get the title from a Gemini page.

    The title is the text of the first non-empty heading line (a line
    starting with one or more ``#`` characters) found outside of
    preformatted blocks.

    Parameters
    ----------
    resp : Gemini response
        Response from server. Only its ``payload`` attribute is read.

    Returns
    -------
    str or None
        The page title, or None if the payload is missing, empty,
        of an unsupported type, or contains no usable heading.
    """

    payload = getattr(resp, 'payload', None)
    if not payload:
        return None

    if isinstance(payload, bytes):
        # NOTE(review): presumably the payload stays as bytes when the
        # client library could not decode it -- confirm. Decode leniently
        # so an ASCII heading can still be found instead of crashing.
        payload = payload.decode('utf-8', errors='replace')
    elif not isinstance(payload, str):
        return None

    in_preformatted = False
    # splitlines() also handles CRLF, so no stray '\r' ends up in the title
    for line in payload.splitlines():
        if line.startswith('```'):
            # A ``` line toggles preformatted mode; text inside such a
            # block (e.g. shell comments) must not be taken as a heading.
            in_preformatted = not in_preformatted
            continue

        if in_preformatted or not line.startswith('#'):
            continue

        # Whitespace after the leading '#' characters is optional
        title = line.lstrip('#').strip()
        if title:
            return title

    return None
|
||||
|
||||
def gen_headers():
|
||||
"""Generate headers for network connection."""
|
||||
@ -3669,6 +3689,22 @@ def network_handler(
|
||||
(title, description, tags, recognized mime, bad url).
|
||||
"""
|
||||
|
||||
GEMINI_CODE_READABLE = {
|
||||
'30': 'Too Many Redirects',
|
||||
'31': 'Too Many Redirects',
|
||||
'40': 'Temporary Failure',
|
||||
'41': 'Server Unavailable',
|
||||
'42': 'CGI Error',
|
||||
'43': 'Proxy Error',
|
||||
'44': 'Slow Down',
|
||||
'50': 'Permanent Failure',
|
||||
'51': 'Not Found',
|
||||
'52': 'Gone',
|
||||
'53': 'Proxy Request Refused',
|
||||
'59': 'Bad Request',
|
||||
'60': 'Client Certificate Required'
|
||||
}
|
||||
|
||||
page_title = None
|
||||
page_desc = None
|
||||
page_keys = None
|
||||
@ -3677,55 +3713,72 @@ def network_handler(
|
||||
if is_nongeneric_url(url) or is_bad_url(url):
|
||||
return (None, None, None, 0, 1)
|
||||
|
||||
if is_ignored_mime(url) or http_head:
|
||||
method = 'HEAD'
|
||||
else:
|
||||
method = 'GET'
|
||||
|
||||
if not MYHEADERS:
|
||||
gen_headers()
|
||||
|
||||
try:
|
||||
manager = get_PoolManager()
|
||||
|
||||
while True:
|
||||
resp = manager.request(method, url, retries=Retry(redirect=10))
|
||||
|
||||
if resp.status == 200:
|
||||
if method == 'GET':
|
||||
page_title, page_desc, page_keys = get_data_from_page(resp)
|
||||
elif resp.status == 403 and url.endswith('/'):
|
||||
# HTTP response Forbidden
|
||||
# Handle URLs in the form of https://www.domain.com/
|
||||
# which fail when trying to fetch resource '/'
|
||||
# retry without trailing '/'
|
||||
|
||||
LOGDBG('Received status 403: retrying...')
|
||||
# Remove trailing /
|
||||
url = url[:-1]
|
||||
resp.close()
|
||||
continue
|
||||
if parse_url(url).scheme == 'gemini':
|
||||
# Required to prevent a too many redirects error from being thrown
|
||||
if url[-1] != '/':
|
||||
url = url + '/'
|
||||
u = Agunua.GeminiUri(url, insecure=True, get_content=True, follow_redirect=True,
|
||||
redirect_depth=5, maxlines=None, maxsize=None)
|
||||
if u.network_success:
|
||||
if u.status_code[0] == '2':
|
||||
page_title = get_title_from_gemini_page(u)
|
||||
elif u.status_code[0] == '1':
|
||||
pass # Input responses have no default title
|
||||
else:
|
||||
LOGERR('[%s] %s', resp.status, resp.reason)
|
||||
LOGERR('[%s] %s', u.status_code, GEMINI_CODE_READABLE[u.status_code])
|
||||
else:
|
||||
LOGERR('network_handler(): %s', u.error)
|
||||
sys.exit(1)
|
||||
else:
|
||||
if is_ignored_mime(url) or http_head:
|
||||
method = 'HEAD'
|
||||
else:
|
||||
method = 'GET'
|
||||
|
||||
if resp:
|
||||
resp.close()
|
||||
if not MYHEADERS:
|
||||
gen_headers()
|
||||
|
||||
break
|
||||
except Exception as e:
|
||||
LOGERR('network_handler(): %s', e)
|
||||
exception = True
|
||||
finally:
|
||||
if manager:
|
||||
manager.clear()
|
||||
if exception:
|
||||
return (None, None, None, 0, 0)
|
||||
if method == 'HEAD':
|
||||
return ('', '', '', 1, 0)
|
||||
if page_title is None:
|
||||
return ('', page_desc, page_keys, 0, 0)
|
||||
try:
|
||||
manager = get_PoolManager()
|
||||
|
||||
return (page_title, page_desc, page_keys, 0, 0)
|
||||
while True:
|
||||
resp = manager.request(method, url, retries=Retry(redirect=10))
|
||||
|
||||
if resp.status == 200:
|
||||
if method == 'GET':
|
||||
page_title, page_desc, page_keys = get_data_from_http_page(resp)
|
||||
elif resp.status == 403 and url.endswith('/'):
|
||||
# HTTP response Forbidden
|
||||
# Handle URLs in the form of https://www.domain.com/
|
||||
# which fail when trying to fetch resource '/'
|
||||
# retry without trailing '/'
|
||||
|
||||
LOGDBG('Received status 403: retrying...')
|
||||
# Remove trailing /
|
||||
url = url[:-1]
|
||||
resp.close()
|
||||
continue
|
||||
else:
|
||||
LOGERR('[%s] %s', resp.status, resp.reason)
|
||||
|
||||
if resp:
|
||||
resp.close()
|
||||
|
||||
break
|
||||
except Exception as e:
|
||||
LOGERR('network_handler(): %s', e)
|
||||
exception = True
|
||||
finally:
|
||||
if manager:
|
||||
manager.clear()
|
||||
if exception:
|
||||
return (None, None, None, 0, 0)
|
||||
if method == 'HEAD':
|
||||
return ('', '', '', 1, 0)
|
||||
if page_title is None:
|
||||
return ('', page_desc, page_keys, 0, 0)
|
||||
|
||||
return (page_title, page_desc, page_keys, 0, 0)
|
||||
|
||||
|
||||
def parse_tags(keywords=[]):
|
||||
|
Loading…
Reference in New Issue
Block a user