From e5ca44e251c1dc7d94cbf3d958b2cc147d5c9fc5 Mon Sep 17 00:00:00 2001 From: Arun Prakash Jana Date: Sat, 9 Apr 2016 17:22:47 +0530 Subject: [PATCH] Request gzip compressed data in HTTP(S) fetch. --- buku | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/buku b/buku index 081cef9..fe94220 100755 --- a/buku +++ b/buku @@ -26,6 +26,8 @@ import webbrowser import html.parser as HTMLParser from http.client import HTTPConnection, HTTPSConnection from urllib.parse import urljoin, quote, unquote +import gzip +import io import signal import shutil @@ -176,6 +178,7 @@ def initdb(): def getPageResp(url, redir=False): """Connect to a server and fetch the requested page data. + Supports gzip compression. Params: URL to fetch, redirection status Returns: connection, HTTP(S) GET response @@ -220,7 +223,9 @@ def getPageResp(url, redir=False): except: url = quote(url) - urlconn.request("GET", url) + urlconn.request("GET", url, None, { + "Accept-encoding": "gzip", + }) resp = urlconn.getresponse() return (urlconn, resp) @@ -232,8 +237,16 @@ def getTitleData(resp): Params: GET response """ + data = None charset = '' charset = resp.headers.get_content_charset() + + if resp.headers.get('Content-Encoding') == 'gzip': + print("gzip") + data = gzip.GzipFile(fileobj=io.BytesIO(resp.read())).read() + else: + data = resp.read() + if charset == None: charset = 'utf-8' if debug: @@ -242,9 +255,9 @@ def getTitleData(resp): parser = BMHTMLParser() try: if charset == 'utf-8': - parser.feed(resp.read().decode(charset, "replace")) + parser.feed(data.decode(charset, "replace")) else: - parser.feed(resp.read().decode(charset)) + parser.feed(data.decode(charset)) except Exception as e: if debug: print("Exception [getTitleData]: %s" % e)