From 8180711a33eacb9bdce0be407102635a170028a4 Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 17:30:46 +0800 Subject: [PATCH 1/7] chg: test: add url for test title fetch --- tests/test_buku.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_buku.py b/tests/test_buku.py index 90d728f..6ea2ec7 100644 --- a/tests/test_buku.py +++ b/tests/test_buku.py @@ -553,6 +553,16 @@ def test_sigint_handler(capsys): ['http://example.com/page1.txt', (('', 1, 0))], ['about:new_page', (('', 0, 1))], ['chrome://version/', (('', 0, 1))], + ['chrome://version/', (('', 0, 1))], + ['http://4pda.ru/forum/index.php?showtopic=182463&st=1640#entry6044923', None], + [ + 'https://www.google.ru/search?' + 'newwindow=1&safe=off&q=xkbcomp+alt+gr&' + 'oq=xkbcomp+alt+gr&' + 'gs_l=serp.3..33i21.28976559.28977886.0.' + '28978017.6.6.0.0.0.0.167.668.0j5.5.0....0...1c.1.64.' + 'serp..1.2.311.06cSKPTLo18', None + ], ] ) def test_network_handler_with_url(url, exp_res): From 496238262f33656f41285eadb309274e1b0fe92e Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 17:38:43 +0800 Subject: [PATCH 2/7] fix: test: fetch title --- tests/test_buku.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_buku.py b/tests/test_buku.py index 6ea2ec7..e41db6b 100644 --- a/tests/test_buku.py +++ b/tests/test_buku.py @@ -1,6 +1,7 @@ """test module.""" from itertools import product from unittest import mock +from urllib.parse import urlparse import json import os import signal @@ -554,14 +555,18 @@ def test_sigint_handler(capsys): ['about:new_page', (('', 0, 1))], ['chrome://version/', (('', 0, 1))], ['chrome://version/', (('', 0, 1))], - ['http://4pda.ru/forum/index.php?showtopic=182463&st=1640#entry6044923', None], + [ + 'http://4pda.ru/forum/index.php?showtopic=182463&st=1640#entry6044923', + ('Samsung GT-I5800 Galaxy 580 - Обсуждение - 4PDA', 0, 0) + ], [ 'https://www.google.ru/search?' 'newwindow=1&safe=off&q=xkbcomp+alt+gr&' 'oq=xkbcomp+alt+gr&' 'gs_l=serp.3..33i21.28976559.28977886.0.' '28978017.6.6.0.0.0.0.167.668.0j5.5.0....0...1c.1.64.' - 'serp..1.2.311.06cSKPTLo18', None + 'serp..1.2.311.06cSKPTLo18', + ('xkbcomp alt gr', 0, 0) ], ] ) @@ -572,6 +577,8 @@ def test_network_handler_with_url(url, exp_res): buku.urllib3 = urllib3 buku.myproxy = None res = buku.network_handler(url) + if urlparse(url).netloc == 'www.google.ru': + res = (res[0].split(" - ")[0], res[1], res[2]) assert res == exp_res From 73c79beca5f9b8113b5bcba769fdf838c5752543 Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 17:39:16 +0800 Subject: [PATCH 3/7] chg: dev: use utf8 as default decoder --- buku.py | 1 + 1 file changed, 1 insertion(+) diff --git a/buku.py b/buku.py index 7ae30e0..05bf0d8 100755 --- a/buku.py +++ b/buku.py @@ -2955,6 +2955,7 @@ def get_page_title(resp): charset = 'utf-8' try: + charset = 'utf-8' if 'content-type' in resp.headers: _, params = cgi.parse_header(resp.headers['content-type']) if params.get('charset') is not None: From 1a0b5bd328ac4d9260c2ff61d29fc1b86014dabf Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 18:36:06 +0800 Subject: [PATCH 4/7] new: test: 2 url for testing --- tests/test_buku.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_buku.py b/tests/test_buku.py index e41db6b..728e9d2 100644 --- a/tests/test_buku.py +++ b/tests/test_buku.py @@ -568,6 +568,17 @@ def test_sigint_handler(capsys): 'serp..1.2.311.06cSKPTLo18', ('xkbcomp alt gr', 0, 0) ], + [ + 'http://www.vim.org/scripts/script.php?script_id=4641', + ( + 'mlessnau_case - "in-case" selection, deletion and substitution ' + 'for underscore, camel, mixed case : vim online', 0, 0 + ) + ], + [ + 'http://www.kadrof.ru/cat_exchange.shtml', + ('Все биржи фриланса и удаленной работы - больше 110 сайтов | Kadrof.ru', 0, 0) + ], ] ) def test_network_handler_with_url(url, exp_res): From 92c6e5c7c7ee8b7e00088ae5dfb5f9a76572d19f Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 19:59:33 +0800 Subject: [PATCH 5/7] chg: dev: use bs4 as backup parser --- buku.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/buku.py b/buku.py index 05bf0d8..1e9375b 100755 --- a/buku.py +++ b/buku.py @@ -2953,20 +2953,37 @@ def get_page_title(resp): parser = BukuHTMLParser() charset = 'utf-8' + soup = None + parsed_title = None try: - charset = 'utf-8' + from bs4 import BeautifulSoup + soup = BeautifulSoup(resp.data, 'html.parser') + except Exception as e: + logerr('get_page_title(): %s', e) + try: + charset_found = False if 'content-type' in resp.headers: _, params = cgi.parse_header(resp.headers['content-type']) if params.get('charset') is not None: charset = params.get('charset') + charset_found = True + if not charset_found and soup: + meta_tag = soup.find('meta', attrs={'http-equiv': 'Content-Type'}) + if meta_tag: + _, params = cgi.parse_header(meta_tag.attrs['content']) + charset = params.get('charset', charset) parser.feed(resp.data.decode(charset)) except Exception as e: + if isinstance(e, UnicodeDecodeError) and soup: + parsed_title = soup.find('title').text # Suppress Exception due to intentional self.reset() in BHTMLParser if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'): logerr('get_page_title(): %s', e) finally: - return re.sub('\s{2,}', ' ', parser.parsed_title) + if not parsed_title: + parsed_title = parser.parsed_title + return re.sub('\s{2,}', ' ', parsed_title) def gen_headers(): From 53e1eb2118101763eb09dd4d67fce6a5a22d68db Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 18:47:57 +0800 Subject: [PATCH 6/7] Revert "Revert "Revert "Remove redundant message, fix check for 0 records""" This reverts commit edf57738301f17de5d9928d8f961811ad4ba58b9. --- buku.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/buku.py b/buku.py index 1e9375b..e5cd80f 100755 --- a/buku.py +++ b/buku.py @@ -1387,8 +1387,6 @@ class BukuDb: resp = input('Delete these bookmarks? (y/n): ') if resp != 'y': return False - else: - return False query = 'DELETE from bookmarks where id BETWEEN ? AND ?' self.cur.execute(query, (low, high)) @@ -1566,8 +1564,7 @@ class BukuDb: resultset = self.cur.execute(query) else: query = 'SELECT * from bookmarks where id BETWEEN ? AND ?' - self.cur.execute(query, (low, high)) - resultset = self.cur.fetchall() + resultset = self.cur.execute(query, (low, high)) except IndexError: logerr('Index out of range') return False @@ -1593,9 +1590,9 @@ class BukuDb: self.cur.execute('SELECT * FROM bookmarks') resultset = self.cur.fetchall() - if len(resultset) < 1: + if not resultset: logerr('0 records') - return False + return True if not self.json: print_rec_with_filter(resultset, self.field_filter) From b567dd2a643dda09bc20dc24a0539317c7fd34b6 Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 20:08:30 +0800 Subject: [PATCH 7/7] chg: test: get_page_title func --- tests/test_buku.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_buku.py b/tests/test_buku.py index 728e9d2..4796898 100644 --- a/tests/test_buku.py +++ b/tests/test_buku.py @@ -53,10 +53,11 @@ def test_get_page_title(): """test func.""" resp = mock.Mock() parser = mock.Mock() + parser.parsed_title = 'doubled whitespace' with mock.patch('buku.BukuHTMLParser', return_value=parser): import buku res = buku.get_page_title(resp) - assert res == parser.parsed_title + assert res == 'doubled whitespace' def test_gen_headers():