Fix #300: detect non-tag keywords
This commit is contained in:
parent
c0400c72c3
commit
99b82c3055
1
buku.1
1
buku.1
@ -43,6 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
|
|||||||
.IP 5. 4
|
.IP 5. 4
|
||||||
\fBTags\fR:
|
\fBTags\fR:
|
||||||
- Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
|
- Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
|
||||||
|
- Page keywords that contain a full stop, have more than 3 words without a comma, or have a word to comma ratio > 3 are appended to the description rather than stored as tags.
|
||||||
- Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
|
- Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
|
||||||
- Releases prior to v2.7 support both capital and lower cases in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is introduced to modify the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
|
- Releases prior to v2.7 support both capital and lower cases in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is introduced to modify the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
|
||||||
- Tags can be edited from the prompt very easily using '>>' (append), '>' (overwrite) and '<<' (remove) symbols. The LHS of the operands denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.
|
- Tags can be edited from the prompt very easily using '>>' (append), '>' (overwrite) and '<<' (remove) symbols. The LHS of the operands denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.
|
||||||
|
53
buku.py
53
buku.py
@ -2931,6 +2931,41 @@ def is_ignored_mime(url):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_unusual_tag(tagstr):
    """Identify unusual tags.

    A tag string is considered unusual when any of these criteria holds:
        - a full stop is found
        - it has more than 3 words without any commas
        - its word to comma ratio is greater than 3

    Parameters
    ----------
    tagstr : str
        tag string to check.

    Returns
    -------
    bool
        True if the tag string is unusual, False otherwise.
        An empty (or None) tag string is never unusual.
    """

    if not tagstr:
        return False

    # A full stop suggests a sentence rather than a list of tags.
    if '.' in tagstr:
        return True

    # Words are whitespace-separated; a comma attached to a word
    # does not split it, so 'a, b' counts as 2 words and 1 comma.
    nwords = len(tagstr.split())
    ncommas = tagstr.count(',')

    # No delimiter at all: unusual only if it is a long phrase.
    if ncommas == 0:
        return nwords > 3

    # Too many words per comma indicates phrases, not tags.
    return (nwords / ncommas) > 3
|
||||||
|
|
||||||
|
|
||||||
def parse_decoded_page(page):
|
def parse_decoded_page(page):
|
||||||
"""Fetch title, description and keywords from decoded html page.
|
"""Fetch title, description and keywords from decoded html page.
|
||||||
|
|
||||||
@ -2953,6 +2988,8 @@ def parse_decoded_page(page):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
title = soup.find('title').text.strip().replace('\n', ' ')
|
title = soup.find('title').text.strip().replace('\n', ' ')
|
||||||
|
if title:
|
||||||
|
title = re.sub('\s{2,}', ' ', title)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logdbg(e)
|
logdbg(e)
|
||||||
|
|
||||||
@ -2967,6 +3004,8 @@ def parse_decoded_page(page):
|
|||||||
try:
|
try:
|
||||||
if description:
|
if description:
|
||||||
desc = description.get('content').strip()
|
desc = description.get('content').strip()
|
||||||
|
if desc:
|
||||||
|
desc = re.sub('\s{2,}', ' ', desc)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logdbg(e)
|
logdbg(e)
|
||||||
|
|
||||||
@ -2974,6 +3013,15 @@ def parse_decoded_page(page):
|
|||||||
try:
|
try:
|
||||||
if keywords:
|
if keywords:
|
||||||
keys = keywords.get('content').strip().replace('\n', ' ')
|
keys = keywords.get('content').strip().replace('\n', ' ')
|
||||||
|
if (is_unusual_tag(keys)):
|
||||||
|
keys = re.sub('\s{2,}', ' ', keys)
|
||||||
|
logdbg('keywords to description: %s', keys)
|
||||||
|
if desc:
|
||||||
|
desc = desc + '\n\n## ' + keys
|
||||||
|
else:
|
||||||
|
desc = '* ' + keys
|
||||||
|
|
||||||
|
keys = None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logdbg(e)
|
logdbg(e)
|
||||||
|
|
||||||
@ -3025,11 +3073,6 @@ def get_data_from_page(resp):
|
|||||||
else:
|
else:
|
||||||
title, desc, keywords = parse_decoded_page(resp.data.decode(errors='replace'))
|
title, desc, keywords = parse_decoded_page(resp.data.decode(errors='replace'))
|
||||||
|
|
||||||
if title is not None:
|
|
||||||
title = re.sub('\s{2,}', ' ', title)
|
|
||||||
if desc is not None:
|
|
||||||
desc = re.sub('\s{2,}', ' ', desc)
|
|
||||||
|
|
||||||
return (title, desc, keywords)
|
return (title, desc, keywords)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logerr(e)
|
logerr(e)
|
||||||
|
Loading…
Reference in New Issue
Block a user