Add support for the Gemini protocol
This commit is contained in:
parent
6ad0a52f82
commit
417ac7f654
@ -77,6 +77,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
|
|||||||
- Multi-threaded full DB refresh, manual encryption support
|
- Multi-threaded full DB refresh, manual encryption support
|
||||||
- Shell completion scripts, man page with examples
|
- Shell completion scripts, man page with examples
|
||||||
- Privacy-aware (no unconfirmed user data collection)
|
- Privacy-aware (no unconfirmed user data collection)
|
||||||
|
- Supports the Gemini protocol
|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
|
|
||||||
@ -88,6 +89,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
|
|||||||
| HTTPS | certifi, urllib3 |
|
| HTTPS | certifi, urllib3 |
|
||||||
| Encryption | cryptography |
|
| Encryption | cryptography |
|
||||||
| HTML | beautifulsoup4, html5lib |
|
| HTML | beautifulsoup4, html5lib |
|
||||||
|
| Gemini | agunua |
|
||||||
|
|
||||||
To copy a URL to the clipboard, `buku` uses `xsel` (or `xclip`) on Linux, `pbcopy` (installed by default) on OS X, `clip` (installed by default) on Windows, `termux-clipboard` on Termux (terminal emulation for Android), and `wl-copy` on Wayland. If X11 is missing, GNU Screen or tmux copy-paste buffers are recognized.
|
To copy a URL to the clipboard, `buku` uses `xsel` (or `xclip`) on Linux, `pbcopy` (installed by default) on OS X, `clip` (installed by default) on Windows, `termux-clipboard` on Termux (terminal emulation for Android), and `wl-copy` on Wayland. If X11 is missing, GNU Screen or tmux copy-paste buffers are recognized.
|
||||||
|
|
||||||
|
149
buku
149
buku
@ -49,6 +49,7 @@ import urllib3
|
|||||||
from urllib3.exceptions import LocationParseError
|
from urllib3.exceptions import LocationParseError
|
||||||
from urllib3.util import parse_url, make_headers, Retry
|
from urllib3.util import parse_url, make_headers, Retry
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import Agunua
|
||||||
# NOTE: catch ModuleNotFoundError instead of Exception
|
# NOTE: catch ModuleNotFoundError instead of Exception
|
||||||
# once Python 3.5 is no longer supported
|
# once Python 3.5 is no longer supported
|
||||||
try:
|
try:
|
||||||
@ -3406,7 +3407,7 @@ def is_bad_url(url):
|
|||||||
|
|
||||||
|
|
||||||
def is_nongeneric_url(url):
|
def is_nongeneric_url(url):
|
||||||
"""Returns True for URLs which are non-http and non-generic.
|
"""Returns True for URLs which are non-http/gemini and non-generic.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@ -3552,7 +3553,7 @@ def parse_decoded_page(page):
|
|||||||
return (title, desc, keys)
|
return (title, desc, keys)
|
||||||
|
|
||||||
|
|
||||||
def get_data_from_page(resp):
|
def get_data_from_http_page(resp):
|
||||||
"""Detect HTTP response encoding and invoke parser with decoded data.
|
"""Detect HTTP response encoding and invoke parser with decoded data.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
@ -3569,7 +3570,7 @@ def get_data_from_page(resp):
|
|||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(resp.data, 'html.parser')
|
soup = BeautifulSoup(resp.data, 'html.parser')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
LOGERR('get_data_from_page(): %s', e)
|
LOGERR('get_data_from_http_page(): %s', e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
charset = None
|
charset = None
|
||||||
@ -3598,6 +3599,25 @@ def get_data_from_page(resp):
|
|||||||
LOGERR(e)
|
LOGERR(e)
|
||||||
return (None, None, None)
|
return (None, None, None)
|
||||||
|
|
||||||
|
def get_title_from_gemini_page(resp):
    """Get the title from a Gemini page.

    The title is the text of the first non-empty level-1 or level-2
    gemtext heading (a line starting with '# ' or '## ') that is found
    outside of preformatted blocks.

    Parameters
    ----------
    resp : Gemini response
        Response from server. Only its `payload` attribute is read.

    Returns
    -------
    str or None
        The page title, or None if the payload is not text or holds no
        usable heading.
    """

    payload = getattr(resp, 'payload', None)

    # Only text can be scanned for headings. A missing payload or a
    # non-text one (e.g. bytes for a binary resource) has no title and
    # must not crash the caller.
    if not isinstance(payload, str):
        return None

    preformatted = False

    for line in payload.split('\n'):
        # Lines starting with ``` toggle preformatted mode; anything in
        # between is literal text (e.g. shell comments), not a heading.
        if line.startswith('```'):
            preformatted = not preformatted
            continue

        if preformatted:
            continue

        if line.startswith(('# ', '## ')):
            # strip() also removes the '\r' left behind by CRLF endings
            title = line.split(' ', 1)[1].strip()
            if title:
                return title

    return None
|
||||||
|
|
||||||
def gen_headers():
|
def gen_headers():
|
||||||
"""Generate headers for network connection."""
|
"""Generate headers for network connection."""
|
||||||
@ -3669,6 +3689,22 @@ def network_handler(
|
|||||||
(title, description, tags, recognized mime, bad url).
|
(title, description, tags, recognized mime, bad url).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
GEMINI_CODE_READABLE = {
|
||||||
|
'30': 'Too Many Redirects',
|
||||||
|
'31': 'Too Many Redirects',
|
||||||
|
'40': 'Temporary Failure',
|
||||||
|
'41': 'Server Unavailable',
|
||||||
|
'42': 'CGI Error',
|
||||||
|
'43': 'Proxy Error',
|
||||||
|
'44': 'Slow Down',
|
||||||
|
'50': 'Permanent Failure',
|
||||||
|
'51': 'Not Found',
|
||||||
|
'52': 'Gone',
|
||||||
|
'53': 'Proxy Request Refused',
|
||||||
|
'59': 'Bad Request',
|
||||||
|
'60': 'Client Certificate Required'
|
||||||
|
}
|
||||||
|
|
||||||
page_title = None
|
page_title = None
|
||||||
page_desc = None
|
page_desc = None
|
||||||
page_keys = None
|
page_keys = None
|
||||||
@ -3677,55 +3713,72 @@ def network_handler(
|
|||||||
if is_nongeneric_url(url) or is_bad_url(url):
|
if is_nongeneric_url(url) or is_bad_url(url):
|
||||||
return (None, None, None, 0, 1)
|
return (None, None, None, 0, 1)
|
||||||
|
|
||||||
if is_ignored_mime(url) or http_head:
|
if parse_url(url).scheme == 'gemini':
|
||||||
method = 'HEAD'
|
# Required to prevent a too many redirects error from being thrown
|
||||||
else:
|
if url[-1] != '/':
|
||||||
method = 'GET'
|
url = url + '/'
|
||||||
|
u = Agunua.GeminiUri(url, insecure=True, get_content=True, follow_redirect=True,
|
||||||
if not MYHEADERS:
|
redirect_depth=5, maxlines=None, maxsize=None)
|
||||||
gen_headers()
|
if u.network_success:
|
||||||
|
if u.status_code[0] == '2':
|
||||||
try:
|
page_title = get_title_from_gemini_page(u)
|
||||||
manager = get_PoolManager()
|
elif u.status_code[0] == '1':
|
||||||
|
pass # Input responses have no default title
|
||||||
while True:
|
|
||||||
resp = manager.request(method, url, retries=Retry(redirect=10))
|
|
||||||
|
|
||||||
if resp.status == 200:
|
|
||||||
if method == 'GET':
|
|
||||||
page_title, page_desc, page_keys = get_data_from_page(resp)
|
|
||||||
elif resp.status == 403 and url.endswith('/'):
|
|
||||||
# HTTP response Forbidden
|
|
||||||
# Handle URLs in the form of https://www.domain.com/
|
|
||||||
# which fail when trying to fetch resource '/'
|
|
||||||
# retry without trailing '/'
|
|
||||||
|
|
||||||
LOGDBG('Received status 403: retrying...')
|
|
||||||
# Remove trailing /
|
|
||||||
url = url[:-1]
|
|
||||||
resp.close()
|
|
||||||
continue
|
|
||||||
else:
|
else:
|
||||||
LOGERR('[%s] %s', resp.status, resp.reason)
|
LOGERR('[%s] %s', u.status_code, GEMINI_CODE_READABLE[u.status_code])
|
||||||
|
else:
|
||||||
|
LOGERR('network_handler(): %s', u.error)
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
if is_ignored_mime(url) or http_head:
|
||||||
|
method = 'HEAD'
|
||||||
|
else:
|
||||||
|
method = 'GET'
|
||||||
|
|
||||||
if resp:
|
if not MYHEADERS:
|
||||||
resp.close()
|
gen_headers()
|
||||||
|
|
||||||
break
|
try:
|
||||||
except Exception as e:
|
manager = get_PoolManager()
|
||||||
LOGERR('network_handler(): %s', e)
|
|
||||||
exception = True
|
|
||||||
finally:
|
|
||||||
if manager:
|
|
||||||
manager.clear()
|
|
||||||
if exception:
|
|
||||||
return (None, None, None, 0, 0)
|
|
||||||
if method == 'HEAD':
|
|
||||||
return ('', '', '', 1, 0)
|
|
||||||
if page_title is None:
|
|
||||||
return ('', page_desc, page_keys, 0, 0)
|
|
||||||
|
|
||||||
return (page_title, page_desc, page_keys, 0, 0)
|
while True:
|
||||||
|
resp = manager.request(method, url, retries=Retry(redirect=10))
|
||||||
|
|
||||||
|
if resp.status == 200:
|
||||||
|
if method == 'GET':
|
||||||
|
page_title, page_desc, page_keys = get_data_from_http_page(resp)
|
||||||
|
elif resp.status == 403 and url.endswith('/'):
|
||||||
|
# HTTP response Forbidden
|
||||||
|
# Handle URLs in the form of https://www.domain.com/
|
||||||
|
# which fail when trying to fetch resource '/'
|
||||||
|
# retry without trailing '/'
|
||||||
|
|
||||||
|
LOGDBG('Received status 403: retrying...')
|
||||||
|
# Remove trailing /
|
||||||
|
url = url[:-1]
|
||||||
|
resp.close()
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
LOGERR('[%s] %s', resp.status, resp.reason)
|
||||||
|
|
||||||
|
if resp:
|
||||||
|
resp.close()
|
||||||
|
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
LOGERR('network_handler(): %s', e)
|
||||||
|
exception = True
|
||||||
|
finally:
|
||||||
|
if manager:
|
||||||
|
manager.clear()
|
||||||
|
if exception:
|
||||||
|
return (None, None, None, 0, 0)
|
||||||
|
if method == 'HEAD':
|
||||||
|
return ('', '', '', 1, 0)
|
||||||
|
if page_title is None:
|
||||||
|
return ('', page_desc, page_keys, 0, 0)
|
||||||
|
|
||||||
|
return (page_title, page_desc, page_keys, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
def parse_tags(keywords=[]):
|
def parse_tags(keywords=[]):
|
||||||
|
1
setup.py
1
setup.py
@ -77,6 +77,7 @@ setup(
|
|||||||
'cryptography>=1.2.3',
|
'cryptography>=1.2.3',
|
||||||
'urllib3>=1.23',
|
'urllib3>=1.23',
|
||||||
'html5lib>=1.0.1',
|
'html5lib>=1.0.1',
|
||||||
|
'agunua>=1.4'
|
||||||
],
|
],
|
||||||
packages=find_packages(exclude=['tests']),
|
packages=find_packages(exclude=['tests']),
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
|
Loading…
Reference in New Issue
Block a user