Fix race condition, proxy auth handling

1. During a full DB refresh, the threads could enter a race condition while
generating the initial headers. Hence, header generation is now decoupled into
a separate function, gen_headers().

2. Authorization information in https_proxy is now handled correctly, using the
auth field returned by urllib3's parse_url(). This was missed when the urllib
dependency was removed completely.

3. Handle exceptions due to malformed proxy URL in multiple places.
This commit is contained in:
Arun Prakash Jana 2016-12-11 20:53:48 +05:30
parent 6502fd7a64
commit 2b90a2319f
No known key found for this signature in database
GPG Key ID: A75979F35C080412

83
buku.py
View File

@ -55,8 +55,8 @@ SKIP_MIMES = {'.pdf', '.txt'}
# Disguise as Firefox on Ubuntu # Disguise as Firefox on Ubuntu
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 \ USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 \
Firefox/50.0' Firefox/50.0'
headers = None # Default dictionary of headers myheaders = None # Default dictionary of headers
proxy = None # Default proxy myproxy = None # Default proxy
# Crypto globals # Crypto globals
BLOCKSIZE = 65536 BLOCKSIZE = 65536
@ -776,6 +776,14 @@ class BukuDb:
done = {'value': 0} # count threads completed done = {'value': 0} # count threads completed
processed = {'value': 0} # count number of records processed processed = {'value': 0} # count number of records processed
# An additional call to generate default headers
# gen_headers() is called within network_handler()
# However, this initial call to setup headers
# ensures there is no race condition among the
# initial threads to setup headers
if not myheaders:
gen_headers()
cond = threading.Condition() cond = threading.Condition()
cond.acquire() cond.acquire()
@ -1576,42 +1584,46 @@ def get_page_title(resp):
return parser.parsed_title return parser.parsed_title
def gen_headers():
    '''Generate the default headers and proxy setting for network connections.

    Populates two module-level globals:

    myheaders: default request headers. If the https_proxy URL carries
               credentials, the headers produced by
               make_headers(basic_auth=...) are merged in.
    myproxy:   value of the https_proxy environment variable with the
               credentials (if any) stripped, or None if it is not set.

    If the proxy URL cannot be parsed, the error is logged, myproxy keeps the
    raw environment value and myheaders holds only the default headers.
    '''

    global myheaders, myproxy

    # Build everything in locals and publish the globals only once they are
    # complete. Callers use `if not myheaders` as the "already initialised"
    # check, so another thread must never observe headers that still lack
    # the auth entry, or a proxy URL that still embeds the credentials.
    headers = {
        'Accept-Encoding': 'gzip,deflate',
        'User-Agent': USER_AGENT,
        'Accept': '*/*',
        'Cookie': '',
        'DNT': '1'
    }
    proxy = os.environ.get('https_proxy')

    if proxy:
        try:
            url = parse_url(proxy)
        except Exception as e:
            # Malformed proxy URL: log it and fall through with the raw value
            logerr(e)
        else:
            # Strip username and password (if present) and update headers
            if url.auth:
                proxy = proxy.replace(url.auth + '@', '')
                # NOTE(review): basic_auth yields an Authorization header;
                # proxies usually expect Proxy-Authorization
                # (make_headers(proxy_basic_auth=...)) -- confirm.
                headers.update(make_headers(basic_auth=url.auth))

            logdbg('proxy: [%s]', proxy)

    # Publish myheaders last: it is the flag other code checks
    myproxy = proxy
    myheaders = headers
def get_PoolManager(): def get_PoolManager():
'''Creates a pool manager with proxy support, if applicable '''Creates a pool manager with proxy support, if applicable
:return: ProxyManager if https_proxy is defined, else PoolManager. :return: ProxyManager if https_proxy is defined, else PoolManager.
''' '''
global headers, proxy if myproxy:
return urllib3.ProxyManager(myproxy, num_pools=1, headers=myheaders)
if not headers: return urllib3.PoolManager(num_pools=1, headers=myheaders)
headers = {
'Accept-Encoding': 'gzip,deflate',
'User-Agent': USER_AGENT,
'Accept': '*/*',
'Cookie': '',
'DNT': '1'
}
proxy = os.environ.get('https_proxy')
if proxy:
url = parse_url(proxy)
# Strip username and password and create header, if present
if url.username:
proxy = proxy.replace(
url.username + ':' + url.password + '@', ''
)
auth_headers = make_headers(
basic_auth=url.username + ':' + url.password
)
headers.update(auth_headers)
logdbg('proxy: [%s]', proxy)
if proxy:
return urllib3.ProxyManager(proxy, num_pools=1, headers=headers)
return urllib3.PoolManager(num_pools=1, headers=headers)
def network_handler(url): def network_handler(url):
@ -1621,6 +1633,7 @@ def network_handler(url):
:return: (title, recognized mime, bad url) tuple :return: (title, recognized mime, bad url) tuple
''' '''
http_handler = None
page_title = None page_title = None
resp = None resp = None
method = 'GET' method = 'GET'
@ -1631,9 +1644,12 @@ def network_handler(url):
if is_ignored_mime(url): if is_ignored_mime(url):
method = 'HEAD' method = 'HEAD'
http_handler = get_PoolManager() if not myheaders:
gen_headers()
try: try:
http_handler = get_PoolManager()
while True: while True:
resp = http_handler.request(method, url, timeout=40) resp = http_handler.request(method, url, timeout=40)
@ -1661,7 +1677,8 @@ def network_handler(url):
except Exception as e: except Exception as e:
logerr('network_handler(): %s', e) logerr('network_handler(): %s', e)
finally: finally:
http_handler.clear() if http_handler:
http_handler.clear()
if method == 'HEAD': if method == 'HEAD':
return ('', 1, 0) return ('', 1, 0)
if page_title is None: if page_title is None: