Request gzip-compressed data in HTTP(S) fetch.

This commit is contained in:
Arun Prakash Jana 2016-04-09 17:22:47 +05:30
parent 4c25e0b1c0
commit e5ca44e251
No known key found for this signature in database
GPG Key ID: C0A712ED95043DCB

19
buku
View File

@ -26,6 +26,8 @@ import webbrowser
import html.parser as HTMLParser import html.parser as HTMLParser
from http.client import HTTPConnection, HTTPSConnection from http.client import HTTPConnection, HTTPSConnection
from urllib.parse import urljoin, quote, unquote from urllib.parse import urljoin, quote, unquote
import gzip
import io
import signal import signal
import shutil import shutil
@ -176,6 +178,7 @@ def initdb():
def getPageResp(url, redir=False): def getPageResp(url, redir=False):
"""Connect to a server and fetch the requested page data. """Connect to a server and fetch the requested page data.
Supports gzip compression.
Params: URL to fetch, redirection status Params: URL to fetch, redirection status
Returns: connection, HTTP(S) GET response Returns: connection, HTTP(S) GET response
@ -220,7 +223,9 @@ def getPageResp(url, redir=False):
except: except:
url = quote(url) url = quote(url)
urlconn.request("GET", url) urlconn.request("GET", url, None, {
"Accept-encoding": "gzip",
})
resp = urlconn.getresponse() resp = urlconn.getresponse()
return (urlconn, resp) return (urlconn, resp)
@ -232,8 +237,16 @@ def getTitleData(resp):
Params: GET response Params: GET response
""" """
data = None
charset = '' charset = ''
charset = resp.headers.get_content_charset() charset = resp.headers.get_content_charset()
if resp.headers.get('Content-Encoding') == 'gzip':
print("gzip")
data = gzip.GzipFile(fileobj=io.BytesIO(resp.read())).read()
else:
data = resp.read()
if charset == None: if charset == None:
charset = 'utf-8' charset = 'utf-8'
if debug: if debug:
@ -242,9 +255,9 @@ def getTitleData(resp):
parser = BMHTMLParser() parser = BMHTMLParser()
try: try:
if charset == 'utf-8': if charset == 'utf-8':
parser.feed(resp.read().decode(charset, "replace")) parser.feed(data.decode(charset, "replace"))
else: else:
parser.feed(resp.read().decode(charset)) parser.feed(data.decode(charset))
except Exception as e: except Exception as e:
if debug: if debug:
print("Exception [getTitleData]: %s" % e) print("Exception [getTitleData]: %s" % e)