chg: dev: use bs4 as backup parser
This commit is contained in:
parent
1a0b5bd328
commit
92c6e5c7c7
21
buku.py
21
buku.py
@@ -2953,20 +2953,37 @@ def get_page_title(resp):
|
||||
|
||||
parser = BukuHTMLParser()
|
||||
charset = 'utf-8'
|
||||
soup = None
|
||||
parsed_title = None
|
||||
|
||||
try:
|
||||
charset = 'utf-8'
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(resp.data, 'html.parser')
|
||||
except Exception as e:
|
||||
logerr('get_page_title(): %s', e)
|
||||
try:
|
||||
charset_found = False
|
||||
if 'content-type' in resp.headers:
|
||||
_, params = cgi.parse_header(resp.headers['content-type'])
|
||||
if params.get('charset') is not None:
|
||||
charset = params.get('charset')
|
||||
charset_found = True
|
||||
if not charset_found and soup:
|
||||
meta_tag = soup.find('meta', attrs={'http-equiv': 'Content-Type'})
|
||||
if meta_tag:
|
||||
_, params = cgi.parse_header(meta_tag.attrs['content'])
|
||||
charset = params.get('charset', charset)
|
||||
parser.feed(resp.data.decode(charset))
|
||||
except Exception as e:
|
||||
if isinstance(e, UnicodeDecodeError) and soup:
|
||||
parsed_title = soup.find('title').text
|
||||
# Suppress Exception due to intentional self.reset() in BHTMLParser
|
||||
if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'):
|
||||
logerr('get_page_title(): %s', e)
|
||||
finally:
|
||||
return re.sub('\s{2,}', ' ', parser.parsed_title)
|
||||
if not parsed_title:
|
||||
parsed_title = parser.parsed_title
|
||||
return re.sub('\s{2,}', ' ', parsed_title)
|
||||
|
||||
|
||||
def gen_headers():
|
||||
|
Loading…
x
Reference in New Issue
Block a user