Add support for the Gemini protocol

This commit is contained in:
pjht 2021-06-19 21:42:11 -05:00
parent 6ad0a52f82
commit 417ac7f654
3 changed files with 104 additions and 48 deletions

View File

@ -77,6 +77,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
- Multi-threaded full DB refresh, manual encryption support
- Shell completion scripts, man page with examples
- Privacy-aware (no unconfirmed user data collection)
- Supports the Gemini protocol
### Installation
@ -88,6 +89,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
| HTTPS | certifi, urllib3 |
| Encryption | cryptography |
| HTML | beautifulsoup4, html5lib |
| Gemini | agunua |
To copy URL to clipboard `buku` uses `xsel` (or `xclip`) on Linux, `pbcopy` (default installed) on OS X, `clip` (default installed) on Windows, `termux-clipboard` on Termux (terminal emulation for Android), `wl-copy` on Wayland. If X11 is missing, GNU Screen or tmux copy-paste buffers are recognized.

149
buku
View File

@ -49,6 +49,7 @@ import urllib3
from urllib3.exceptions import LocationParseError
from urllib3.util import parse_url, make_headers, Retry
from bs4 import BeautifulSoup
import Agunua
# note catch ModuleNotFoundError instead Exception
# when python3.5 not supported
try:
@ -3406,7 +3407,7 @@ def is_bad_url(url):
def is_nongeneric_url(url):
"""Returns True for URLs which are non-http and non-generic.
"""Returns True for URLs which are non-http/gemini and non-generic.
Parameters
----------
@ -3552,7 +3553,7 @@ def parse_decoded_page(page):
return (title, desc, keys)
def get_data_from_page(resp):
def get_data_from_http_page(resp):
"""Detect HTTP response encoding and invoke parser with decoded data.
Parameters
@ -3569,7 +3570,7 @@ def get_data_from_page(resp):
try:
soup = BeautifulSoup(resp.data, 'html.parser')
except Exception as e:
LOGERR('get_data_from_page(): %s', e)
LOGERR('get_data_from_http_page(): %s', e)
try:
charset = None
@ -3598,6 +3599,25 @@ def get_data_from_page(resp):
LOGERR(e)
return (None, None, None)
def get_title_from_gemini_page(resp):
    """Return the first heading of a Gemini (gemtext) page as its title.

    Parameters
    ----------
    resp : Agunua.GeminiUri
        Response from the Gemini server; the decoded page body is
        expected in ``resp.payload``.

    Returns
    -------
    str or None
        Text of the first level-1, level-2 or level-3 heading line,
        or None if the page contains no heading.
    """
    for line in resp.payload.split('\n'):
        # Gemtext defines exactly three heading levels: '# ', '## ', '### '.
        # The original check missed level-3 headings, so pages whose first
        # heading is '### ...' produced no title.
        if line.startswith(('# ', '## ', '### ')):
            # Drop the heading marker, keep the rest of the line verbatim.
            return line.split(' ', 1)[1]
    return None
def gen_headers():
"""Generate headers for network connection."""
@ -3669,6 +3689,22 @@ def network_handler(
(title, description, tags, recognized mime, bad url).
"""
GEMINI_CODE_READABLE = {
'30': 'Too Many Redirects',
'31': 'Too Many Redirects',
'40': 'Temporary Failure',
'41': 'Server Unavailable',
'42': 'CGI Error',
'43': 'Proxy Error',
'44': 'Slow Down',
'50': 'Permanent Failure',
'51': 'Not Found',
'52': 'Gone',
'53': 'Proxy Request Refused',
'59': 'Bad Request',
'60': 'Client Certificate Required'
}
page_title = None
page_desc = None
page_keys = None
@ -3677,55 +3713,72 @@ def network_handler(
if is_nongeneric_url(url) or is_bad_url(url):
return (None, None, None, 0, 1)
if is_ignored_mime(url) or http_head:
method = 'HEAD'
else:
method = 'GET'
if not MYHEADERS:
gen_headers()
try:
manager = get_PoolManager()
while True:
resp = manager.request(method, url, retries=Retry(redirect=10))
if resp.status == 200:
if method == 'GET':
page_title, page_desc, page_keys = get_data_from_page(resp)
elif resp.status == 403 and url.endswith('/'):
# HTTP response Forbidden
# Handle URLs in the form of https://www.domain.com/
# which fail when trying to fetch resource '/'
# retry without trailing '/'
LOGDBG('Received status 403: retrying...')
# Remove trailing /
url = url[:-1]
resp.close()
continue
if parse_url(url).scheme == 'gemini':
# Required to prevent a too many redirects error from being thrown
if url[-1] != '/':
url = url + '/'
u = Agunua.GeminiUri(url, insecure=True, get_content=True, follow_redirect=True,
redirect_depth=5, maxlines=None, maxsize=None)
if u.network_success:
if u.status_code[0] == '2':
page_title = get_title_from_gemini_page(u)
elif u.status_code[0] == '1':
pass # Input responses have no default title
else:
LOGERR('[%s] %s', resp.status, resp.reason)
LOGERR('[%s] %s', u.status_code, GEMINI_CODE_READABLE[u.status_code])
else:
LOGERR('network_handler(): %s', u.error)
sys.exit(1)
else:
if is_ignored_mime(url) or http_head:
method = 'HEAD'
else:
method = 'GET'
if resp:
resp.close()
if not MYHEADERS:
gen_headers()
break
except Exception as e:
LOGERR('network_handler(): %s', e)
exception = True
finally:
if manager:
manager.clear()
if exception:
return (None, None, None, 0, 0)
if method == 'HEAD':
return ('', '', '', 1, 0)
if page_title is None:
return ('', page_desc, page_keys, 0, 0)
try:
manager = get_PoolManager()
return (page_title, page_desc, page_keys, 0, 0)
while True:
resp = manager.request(method, url, retries=Retry(redirect=10))
if resp.status == 200:
if method == 'GET':
page_title, page_desc, page_keys = get_data_from_http_page(resp)
elif resp.status == 403 and url.endswith('/'):
# HTTP response Forbidden
# Handle URLs in the form of https://www.domain.com/
# which fail when trying to fetch resource '/'
# retry without trailing '/'
LOGDBG('Received status 403: retrying...')
# Remove trailing /
url = url[:-1]
resp.close()
continue
else:
LOGERR('[%s] %s', resp.status, resp.reason)
if resp:
resp.close()
break
except Exception as e:
LOGERR('network_handler(): %s', e)
exception = True
finally:
if manager:
manager.clear()
if exception:
return (None, None, None, 0, 0)
if method == 'HEAD':
return ('', '', '', 1, 0)
if page_title is None:
return ('', page_desc, page_keys, 0, 0)
return (page_title, page_desc, page_keys, 0, 0)
def parse_tags(keywords=[]):

View File

@ -77,6 +77,7 @@ setup(
'cryptography>=1.2.3',
'urllib3>=1.23',
'html5lib>=1.0.1',
'agunua>=1.4'
],
packages=find_packages(exclude=['tests']),
include_package_data=True,