Fix race condition, proxy auth handling

1. In case of a full DB refresh, the threads might enter a race condition while
generating the initial headers. Hence, the header generation logic has been
decoupled into its own function.

2. Authorization information in https_proxy is now handled correctly using
urllib3's parse_url(). This was missed when the urllib dependency was
completely removed.

3. Handle exceptions due to malformed proxy URL in multiple places.
This commit is contained in:
Arun Prakash Jana 2016-12-11 20:53:48 +05:30
parent 6502fd7a64
commit 2b90a2319f
No known key found for this signature in database
GPG Key ID: A75979F35C080412

71
buku.py
View File

@ -55,8 +55,8 @@ SKIP_MIMES = {'.pdf', '.txt'}
# Disguise as Firefox on Ubuntu # Disguise as Firefox on Ubuntu
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 \ USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 \
Firefox/50.0' Firefox/50.0'
headers = None # Default dictionary of headers myheaders = None # Default dictionary of headers
proxy = None # Default proxy myproxy = None # Default proxy
# Crypto globals # Crypto globals
BLOCKSIZE = 65536 BLOCKSIZE = 65536
@ -776,6 +776,14 @@ class BukuDb:
done = {'value': 0} # count threads completed done = {'value': 0} # count threads completed
processed = {'value': 0} # count number of records processed processed = {'value': 0} # count number of records processed
# An additional call to generate default headers
# gen_headers() is called within network_handler()
# However, this initial call to setup headers
# ensures there is no race condition among the
# initial threads to setup headers
if not myheaders:
gen_headers()
cond = threading.Condition() cond = threading.Condition()
cond.acquire() cond.acquire()
@ -1576,16 +1584,12 @@ def get_page_title(resp):
return parser.parsed_title return parser.parsed_title
def get_PoolManager(): def gen_headers():
'''Creates a pool manager with proxy support, if applicable '''Generate headers for network connection'''
:return: ProxyManager if https_proxy is defined, else PoolManager. global myheaders, myproxy
'''
global headers, proxy myheaders = {
if not headers:
headers = {
'Accept-Encoding': 'gzip,deflate', 'Accept-Encoding': 'gzip,deflate',
'User-Agent': USER_AGENT, 'User-Agent': USER_AGENT,
'Accept': '*/*', 'Accept': '*/*',
@ -1593,25 +1597,33 @@ def get_PoolManager():
'DNT': '1' 'DNT': '1'
} }
proxy = os.environ.get('https_proxy') myproxy = os.environ.get('https_proxy')
if proxy: if myproxy:
url = parse_url(proxy) try:
# Strip username and password and create header, if present url = parse_url(myproxy)
if url.username: except Exception as e:
proxy = proxy.replace( logerr(e)
url.username + ':' + url.password + '@', '' return
)
auth_headers = make_headers(
basic_auth=url.username + ':' + url.password
)
headers.update(auth_headers)
logdbg('proxy: [%s]', proxy) # Strip username and password (if present) and update headers
if url.auth:
myproxy = myproxy.replace(url.auth + '@', '')
auth_headers = make_headers(basic_auth=url.auth)
myheaders.update(auth_headers)
if proxy: logdbg('proxy: [%s]', myproxy)
return urllib3.ProxyManager(proxy, num_pools=1, headers=headers)
return urllib3.PoolManager(num_pools=1, headers=headers)
def get_PoolManager():
    '''Creates a pool manager with proxy support, if applicable

    :return: ProxyManager if https_proxy is defined, else PoolManager.
    '''

    # No proxy configured: plain pool manager with the default headers
    if not myproxy:
        return urllib3.PoolManager(num_pools=1, headers=myheaders)

    # Route requests through the proxy parsed from https_proxy
    return urllib3.ProxyManager(myproxy, num_pools=1, headers=myheaders)
def network_handler(url): def network_handler(url):
@ -1621,6 +1633,7 @@ def network_handler(url):
:return: (title, recognized mime, bad url) tuple :return: (title, recognized mime, bad url) tuple
''' '''
http_handler = None
page_title = None page_title = None
resp = None resp = None
method = 'GET' method = 'GET'
@ -1631,9 +1644,12 @@ def network_handler(url):
if is_ignored_mime(url): if is_ignored_mime(url):
method = 'HEAD' method = 'HEAD'
http_handler = get_PoolManager() if not myheaders:
gen_headers()
try: try:
http_handler = get_PoolManager()
while True: while True:
resp = http_handler.request(method, url, timeout=40) resp = http_handler.request(method, url, timeout=40)
@ -1661,6 +1677,7 @@ def network_handler(url):
except Exception as e: except Exception as e:
logerr('network_handler(): %s', e) logerr('network_handler(): %s', e)
finally: finally:
if http_handler:
http_handler.clear() http_handler.clear()
if method == 'HEAD': if method == 'HEAD':
return ('', 1, 0) return ('', 1, 0)