chg: dev: use bs4 as backup parser

2018-08-01 19:59:33 +08:00 · 2018-08-01 19:59:33 +08:00 · 92c6e5c7c7
commit 92c6e5c7c7
parent 1a0b5bd328
1 changed file with 19 additions and 2 deletions
--- a/buku.py
+++ b/buku.py
@@ -2953,20 +2953,37 @@ def get_page_title(resp):

    parser = BukuHTMLParser()
    charset = 'utf-8'
+    soup = None
+    parsed_title = None

    try:
-        charset = 'utf-8'
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(resp.data, 'html.parser')
+    except Exception as e:
+        logerr('get_page_title(): %s', e)
+    try:
+        charset_found = False
        if 'content-type' in resp.headers:
            _, params = cgi.parse_header(resp.headers['content-type'])
            if params.get('charset') is not None:
                charset = params.get('charset')
+                charset_found = True
+        if not charset_found and soup:
+            meta_tag = soup.find('meta', attrs={'http-equiv': 'Content-Type'})
+            if meta_tag:
+                _, params = cgi.parse_header(meta_tag.attrs['content'])
+                charset = params.get('charset', charset)
        parser.feed(resp.data.decode(charset))
    except Exception as e:
+        if isinstance(e, UnicodeDecodeError) and soup:
+            parsed_title = soup.find('title').text
        # Suppress Exception due to intentional self.reset() in BHTMLParser
        if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'):
            logerr('get_page_title(): %s', e)
    finally:
-        return re.sub('\s{2,}', ' ', parser.parsed_title)
+        if not parsed_title:
+            parsed_title = parser.parsed_title
+        return re.sub('\s{2,}', ' ', parsed_title)


 def gen_headers():