Fix #300: detect non-tag keywords

This commit is contained in:
Arun Prakash Jana 2018-08-06 21:13:02 +05:30
parent c0400c72c3
commit 99b82c3055
No known key found for this signature in database
GPG Key ID: A75979F35C080412
2 changed files with 49 additions and 5 deletions

1
buku.1
View File

@ -43,6 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
.IP 5. 4
\fBTags\fR:
- Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
- Page keywords that contain a full stop, have more than 3 words without a comma, or have a word to comma ratio greater than 3 are appended to the description instead of being stored as tags.
- Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
- Releases prior to v2.7 support both capital and lower cases in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is introduced to modify the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
- Tags can be edited from the prompt very easily using '>>' (append), '>' (overwrite) and '<<' (remove) symbols. The LHS of the operands denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.

53
buku.py
View File

@ -2931,6 +2931,41 @@ def is_ignored_mime(url):
return False
def is_unusual_tag(tagstr):
    """Identify unusual tags.

    A tag string is treated as unusual when any of these criteria holds:
    - a full stop is found
    - it has more than 3 words and no commas
    - its word to comma ratio is greater than 3

    Parameters
    ----------
    tagstr : str
        tag string to check.

    Returns
    -------
    bool
        True if the tag string is unusual, False otherwise. An empty
        (or None) tag string is never unusual.
    """

    if not tagstr:
        return False

    # Any full stop marks the whole string as unusual.
    if '.' in tagstr:
        return True

    # Words are whitespace-separated, so 'a,b' counts as a single word.
    nwords = len(tagstr.split())
    ncommas = tagstr.count(',')

    # Without commas only the absolute word count matters; this also
    # keeps the ratio below from dividing by zero.
    if ncommas == 0:
        return nwords > 3

    return nwords / ncommas > 3
def parse_decoded_page(page):
"""Fetch title, description and keywords from decoded html page.
@ -2953,6 +2988,8 @@ def parse_decoded_page(page):
try:
title = soup.find('title').text.strip().replace('\n', ' ')
if title:
title = re.sub('\s{2,}', ' ', title)
except Exception as e:
logdbg(e)
@ -2967,6 +3004,8 @@ def parse_decoded_page(page):
try:
if description:
desc = description.get('content').strip()
if desc:
desc = re.sub('\s{2,}', ' ', desc)
except Exception as e:
logdbg(e)
@ -2974,6 +3013,15 @@ def parse_decoded_page(page):
try:
if keywords:
keys = keywords.get('content').strip().replace('\n', ' ')
if (is_unusual_tag(keys)):
keys = re.sub('\s{2,}', ' ', keys)
logdbg('keywords to description: %s', keys)
if desc:
desc = desc + '\n\n## ' + keys
else:
desc = '* ' + keys
keys = None
except Exception as e:
logdbg(e)
@ -3025,11 +3073,6 @@ def get_data_from_page(resp):
else:
title, desc, keywords = parse_decoded_page(resp.data.decode(errors='replace'))
if title is not None:
title = re.sub('\s{2,}', ' ', title)
if desc is not None:
desc = re.sub('\s{2,}', ' ', desc)
return (title, desc, keywords)
except Exception as e:
logerr(e)