Fix #300: detect non-tag keywords
This commit is contained in:
parent
c0400c72c3
commit
99b82c3055
1
buku.1
1
buku.1
@ -43,6 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
|
||||
.IP 5. 4
|
||||
\fBTags\fR:
|
||||
- Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
|
||||
- Page keywords that contain a full stop, have more than 3 words without any comma, or have a word-to-comma ratio greater than 3 are appended to the description instead of being stored as tags.
|
||||
- Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
|
||||
- Releases prior to v2.7 supported both upper and lower case in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR was introduced to convert the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
|
||||
- Tags can be edited from the prompt very easily using the '>>' (append), '>' (overwrite) and '<<' (remove) operators. The LHS of the operator denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.
|
||||
|
53
buku.py
53
buku.py
@ -2931,6 +2931,41 @@ def is_ignored_mime(url):
|
||||
return False
|
||||
|
||||
|
||||
def is_unusual_tag(tagstr):
    """Identify unusual tags.

    A keyword string is treated as unusual (i.e. unlikely to be a
    comma-separated tag list) when any of these criteria is met:
        - a full stop found
        - more than 3 words without any commas
        - word to comma ratio is greater than 3

    Parameters
    ----------
    tagstr : str
        tag string to check.

    Returns
    -------
    bool
        True if the tag string is unusual, False otherwise
        (including when it is empty or None).
    """

    if not tagstr:
        return False

    if '.' in tagstr:
        return True

    # Words are whitespace-delimited; a comma attached to a word does not
    # split it, so 'a,b,c' counts as a single word with two commas.
    nwords = len(tagstr.split())
    ncommas = tagstr.count(',')

    if ncommas == 0:
        # No delimiter at all: only a long phrase is considered unusual
        return nwords > 3

    return nwords / ncommas > 3
|
||||
|
||||
|
||||
def parse_decoded_page(page):
|
||||
"""Fetch title, description and keywords from decoded html page.
|
||||
|
||||
@ -2953,6 +2988,8 @@ def parse_decoded_page(page):
|
||||
|
||||
try:
|
||||
title = soup.find('title').text.strip().replace('\n', ' ')
|
||||
if title:
|
||||
title = re.sub('\s{2,}', ' ', title)
|
||||
except Exception as e:
|
||||
logdbg(e)
|
||||
|
||||
@ -2967,6 +3004,8 @@ def parse_decoded_page(page):
|
||||
try:
|
||||
if description:
|
||||
desc = description.get('content').strip()
|
||||
if desc:
|
||||
desc = re.sub('\s{2,}', ' ', desc)
|
||||
except Exception as e:
|
||||
logdbg(e)
|
||||
|
||||
@ -2974,6 +3013,15 @@ def parse_decoded_page(page):
|
||||
try:
|
||||
if keywords:
|
||||
keys = keywords.get('content').strip().replace('\n', ' ')
|
||||
if (is_unusual_tag(keys)):
|
||||
keys = re.sub('\s{2,}', ' ', keys)
|
||||
logdbg('keywords to description: %s', keys)
|
||||
if desc:
|
||||
desc = desc + '\n\n## ' + keys
|
||||
else:
|
||||
desc = '* ' + keys
|
||||
|
||||
keys = None
|
||||
except Exception as e:
|
||||
logdbg(e)
|
||||
|
||||
@ -3025,11 +3073,6 @@ def get_data_from_page(resp):
|
||||
else:
|
||||
title, desc, keywords = parse_decoded_page(resp.data.decode(errors='replace'))
|
||||
|
||||
if title is not None:
|
||||
title = re.sub('\s{2,}', ' ', title)
|
||||
if desc is not None:
|
||||
desc = re.sub('\s{2,}', ' ', desc)
|
||||
|
||||
return (title, desc, keywords)
|
||||
except Exception as e:
|
||||
logerr(e)
|
||||
|
Loading…
Reference in New Issue
Block a user