Reformat: distinct API to fetch page data.

Signed-off-by: Arun Prakash Jana <engineerarun@gmail.com>
This commit is contained in:
Arun Prakash Jana 2016-03-23 20:28:21 +05:30
parent e832826118
commit 2708e70004

106
buku
View File

@ -107,86 +107,80 @@ def usage():
# Get page response data
def getPageResp(url, redir=False):
    """Open a connection to the server in ``url`` and send a GET request.

    Parameters:
        url: Absolute URL. Only URLs that *start with* ``https://`` or
            ``http://`` are handled.
        redir: When false (the default) and the URL has a path, only the
            path part (everything from the first ``/`` after the host) is
            sent as the request target. When true, the complete URL is sent
            as the request target instead.

    Returns:
        A ``(response, connection)`` tuple. The caller owns the connection
        and must close it. ``(None, None)`` is returned when the URL does
        not use a supported scheme.

    Raises:
        Whatever the underlying connection raises while sending the request
        or reading the response; the connection is closed before the
        exception propagates.
    """
    # Match the scheme as a prefix only: a substring search would also hit
    # URLs that merely embed "https://" somewhere (e.g. in a query string)
    # and the fixed-width slice below would then cut the wrong characters.
    if url.startswith("https://"):  # Secure connection
        connClass = HTTPSConnection
        server = url[8:]
    elif url.startswith("http://"):  # Insecure connection
        connClass = HTTPConnection
        server = url[7:]
    else:
        return (None, None)

    # Split "host[:port]/path" at the first slash.
    marker = server.find("/")
    if marker > 0:
        if not redir:
            url = server[marker:]
        server = server[:marker]

    urlconn = connClass(server, timeout=30)

    if debug:
        print("server: [%s]" % server)
        print("URL: [%s]" % unquote(url))

    try:
        urlconn.request("GET", unquote(url))
        resp = urlconn.getresponse()
    except Exception:
        # The caller never receives the handle on failure, so release the
        # connection here before re-raising.
        urlconn.close()
        raise

    return (resp, urlconn)
# Fetch title from URL
def fetchTitle(url):
    """Fetch the page at ``url`` and return its title text.

    The page is requested through getPageResp(). A single 301/302
    redirection is followed by resolving the ``Location`` header against
    ``url``. On a 200 response the body is handed to getTitleData(); the
    result is then read from the module-level ``titleData`` global.

    Parameters:
        url: URL of the page to fetch.

    Returns:
        The value of ``titleData`` with surrounding whitespace stripped and
        newlines removed, or ``''`` when no title was obtained (unsupported
        URL, HTTP error, blocked redirect, or any exception, which is
        printed and swallowed).
    """
    global titleData

    # Start from a clean slate so that a failed fetch cannot return the
    # title left behind by an earlier call.
    titleData = None
    urlconn = None

    try:
        resp, urlconn = getPageResp(url, False)
        if resp is None:
            return ''

        if resp.status != 200:
            # Handle first redirection
            if resp.status in (301, 302,):
                if debug:
                    print("Location header: %s" % resp.getheader('location', ''))

                redirurl = urljoin(url, resp.getheader('location', ''))
                if "sorry/IndexRedirect?" in redirurl:
                    print("ERROR: Connection blocked due to unusual activity.")
                else:
                    if debug:
                        print("Trying to fetch redirected URL.")
                    # Drop the first connection before opening the one for
                    # the redirect target.
                    urlconn.close()
                    resp, urlconn = getPageResp(redirurl, True)
                    if resp is not None:
                        if resp.status != 200:
                            print("ERROR on retry:", str(resp.status), ": ", resp.reason)
                        else:
                            getTitleData(resp)
            else:  # if resp.status in (301,302,):
                print("ERROR:", str(resp.status), ": ", resp.reason)
        else:  # if resp.status != 200:
            getTitleData(resp)
    except Exception as e:
        print("Exception: %s" % e)
    finally:
        # urlconn is None when no connection was opened, or when the
        # redirect target used an unsupported scheme.
        if urlconn is not None:
            urlconn.close()

    if titleData is None:
        return ''
    return titleData.strip().replace("\n", "")