Add support for the Gemini protocol

2021-06-19 21:42:11 -05:00 · 2021-06-19 21:42:11 -05:00 · 417ac7f654
commit 417ac7f654
parent 6ad0a52f82
3 changed files with 104 additions and 48 deletions
--- a/README.md
+++ b/README.md
@ -77,6 +77,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
 - Multi-threaded full DB refresh, manual encryption support
 - Shell completion scripts, man page with examples
 - Privacy-aware (no unconfirmed user data collection)
 - Supports the Gemini protocol
 ### Installation
@ -88,6 +89,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
 | HTTPS | certifi, urllib3 |
 | Encryption | cryptography |
 | HTML | beautifulsoup4, html5lib |
 | Gemini | agunua |
 To copy URL to clipboard `buku` uses `xsel` (or `xclip`) on Linux, `pbcopy` (default installed) on OS X, `clip` (default installed) on Windows, `termux-clipboard` on Termux (terminal emulation for Android), `wl-copy` on Wayland. If X11 is missing, GNU Screen or tmux copy-paste buffers are recognized.
--- a/61
+++ b/61
@ -49,6 +49,7 @@ import urllib3
 from urllib3.exceptions import LocationParseError
 from urllib3.util import parse_url, make_headers, Retry
 from bs4 import BeautifulSoup
 import Agunua
 # NOTE: catch ModuleNotFoundError instead of Exception
 # once Python 3.5 is no longer supported
 try:
@ -3406,7 +3407,7 @@ def is_bad_url(url):
 def is_nongeneric_url(url):
-    """Returns True for URLs which are non-http and non-generic.
+    """Returns True for URLs which are non-http/gemini and non-generic.
    Parameters
    ----------
@ -3552,7 +3553,7 @@ def parse_decoded_page(page):
    return (title, desc, keys)
-def get_data_from_page(resp):
+def get_data_from_http_page(resp):
    """Detect HTTP response encoding and invoke parser with decoded data.
    Parameters
@ -3569,7 +3570,7 @@ def get_data_from_page(resp):
    try:
        soup = BeautifulSoup(resp.data, 'html.parser')
    except Exception as e:
-        LOGERR('get_data_from_page(): %s', e)
+        LOGERR('get_data_from_http_page(): %s', e)
    try:
        charset = None
@ -3598,6 +3599,25 @@ def get_data_from_page(resp):
        LOGERR(e)
        return (None, None, None)
 def get_title_from_gemini_page(resp):
     """Get the title from a Gemini (gemtext) page.

     The title is the text of the first level-1 (``#``) or level-2 (``##``)
     heading line found outside preformatted sections.

     Parameters
     ----------
     resp : Gemini response
         Response from server. Only its ``payload`` attribute is read.

     Returns
     -------
     str or None
         The page title with surrounding whitespace removed, or None if the
         payload is not text or contains no non-empty heading.
     """

     payload = getattr(resp, 'payload', None)
     # NOTE(review): the payload is presumably bytes for non-text media types
     # and may be None when no content was fetched - confirm against Agunua.
     # Either way there is nothing to parse, so report "no title" rather
     # than raising.
     if not isinstance(payload, str):
         return None

     preformatted = False
     for line in payload.split('\n'):
         # A line starting with ``` toggles preformatted mode; text inside
         # such a section (e.g. '# shell comment') is never a heading.
         if line.startswith('```'):
             preformatted = not preformatted
             continue
         if preformatted or not line.startswith('#'):
             continue

         text = line.lstrip('#')
         level = len(line) - len(text)
         # Only level-1 and level-2 headings are considered page titles.
         if level > 2:
             continue

         # The space after the marker is optional; strip() also drops the
         # trailing '\r' left behind by CRLF line endings.
         title = text.strip()
         if title:
             return title

     return None
 def gen_headers():
    """Generate headers for network connection."""
@ -3669,6 +3689,22 @@ def network_handler(
        (title, description, tags, recognized mime, bad url).
    """
    GEMINI_CODE_READABLE = {
        '30': 'Too Many Redirects',
        '31': 'Too Many Redirects',
        '40': 'Temporary Failure',
        '41': 'Server Unavailable',
        '42': 'CGI Error',
        '43': 'Proxy Error',
        '44': 'Slow Down',
        '50': 'Permanent Failure',
        '51': 'Not Found',
        '52': 'Gone',
        '53': 'Proxy Request Refused',
        '59': 'Bad Request',
        '60': 'Client Certificate Required'
    }
    page_title = None
    page_desc = None
    page_keys = None
@ -3677,6 +3713,23 @@ def network_handler(
    if is_nongeneric_url(url) or is_bad_url(url):
        return (None, None, None, 0, 1)
    if parse_url(url).scheme == 'gemini':
        # Required to prevent a too many redirects error from being thrown
        if url[-1] != '/':
            url = url + '/'
        u = Agunua.GeminiUri(url, insecure=True, get_content=True, follow_redirect=True,
                             redirect_depth=5, maxlines=None, maxsize=None)
        if u.network_success:
            if u.status_code[0] == '2':
                page_title = get_title_from_gemini_page(u)
            elif u.status_code[0] == '1':
                pass  # Input responses have no default title
            else:
                LOGERR('[%s] %s', u.status_code, GEMINI_CODE_READABLE[u.status_code])
        else:
            LOGERR('network_handler(): %s', u.error)
            sys.exit(1)
    else:
        if is_ignored_mime(url) or http_head:
            method = 'HEAD'
        else:
@ -3693,7 +3746,7 @@ def network_handler(
                if resp.status == 200:
                    if method == 'GET':
-                    page_title, page_desc, page_keys = get_data_from_page(resp)
+                        page_title, page_desc, page_keys = get_data_from_http_page(resp)
                elif resp.status == 403 and url.endswith('/'):
                    # HTTP response Forbidden
                    # Handle URLs in the form of https://www.domain.com/
--- a/setup.py
+++ b/setup.py
@ -77,6 +77,7 @@ setup(
        'cryptography>=1.2.3',
        'urllib3>=1.23',
        'html5lib>=1.0.1',
        'agunua>=1.4'
    ],
    packages=find_packages(exclude=['tests']),
    include_package_data=True,