Detect page charset from content-type field
This commit is contained in:
parent
88c4ccbccb
commit
69281444fd
8
buku.py
8
buku.py
@@ -18,6 +18,7 @@
|
|||||||
# along with Buku. If not, see <http://www.gnu.org/licenses/>.
|
# along with Buku. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import cgi
|
||||||
import collections
|
import collections
|
||||||
import html.parser as HTMLParser
|
import html.parser as HTMLParser
|
||||||
import json
|
import json
|
||||||
@@ -2948,9 +2949,14 @@ def get_page_title(resp):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
parser = BukuHTMLParser()
|
parser = BukuHTMLParser()
|
||||||
|
charset = 'utf-8'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
parser.feed(resp.data.decode('utf-8'))
|
if 'content-type' in resp.headers:
|
||||||
|
_, params = cgi.parse_header(resp.headers['content-type'])
|
||||||
|
if 'charset' in params:
|
||||||
|
charset = params['charset']
|
||||||
|
parser.feed(resp.data.decode(charset))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Suppress Exception due to intentional self.reset() in BukuHTMLParser
|
# Suppress Exception due to intentional self.reset() in BukuHTMLParser
|
||||||
if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'):
|
if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user