Refine tag and desc parse

This commit is contained in:
Arun Prakash Jana 2018-08-07 06:35:57 +05:30
parent 67e7e329e6
commit bf99c98d91
No known key found for this signature in database
GPG Key ID: A75979F35C080412
2 changed files with 8 additions and 14 deletions

2
buku.1
View File

@ -43,7 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
.IP 5. 4
\fBTags\fR:
- Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
- Page keywords with full stops or having more than 3 words without a comma or having a word to comma ratio > 3 are appended to description rather than tags.
- Page keywords having a word to comma ratio > 3 are appended to description rather than tags.
- Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
- Releases prior to v2.7 supported both uppercase and lowercase letters in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is introduced to modify the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
- Tags can be edited from the prompt very easily using the '>>' (append), '>' (overwrite) and '<<' (remove) operators. The LHS of the operator denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.

20
buku.py
View File

@ -2937,10 +2937,7 @@ def is_ignored_mime(url):
def is_unusual_tag(tagstr):
"""Identify unusual tags. Criteria:
- a full stop found
- more than 3 words without any commas
- word to comma ratio is greater than 3
"""Identify unusual tags with word to comma ratio > 3.
Parameters
----------
@ -2956,9 +2953,6 @@ def is_unusual_tag(tagstr):
if not tagstr:
return False
if tagstr.find('.') != -1:
return True
nwords = len(tagstr.split())
ncommas = tagstr.count(',') + 1
@ -2995,14 +2989,14 @@ def parse_decoded_page(page):
except Exception as e:
logdbg(e)
description = (soup.find('meta', attrs={'name':'og:description'}) or
soup.find('meta', attrs={'name':'og:Description'}) or
soup.find('meta', attrs={'property':'og:description'}) or
soup.find('meta', attrs={'property':'og:Description'}) or
description = (soup.find('meta', attrs={'name':'description'}) or
soup.find('meta', attrs={'name':'Description'}) or
soup.find('meta', attrs={'property':'description'}) or
soup.find('meta', attrs={'property':'Description'}) or
soup.find('meta', attrs={'name':'description'}) or
soup.find('meta', attrs={'name':'Description'}))
soup.find('meta', attrs={'name':'og:description'}) or
soup.find('meta', attrs={'name':'og:Description'}) or
soup.find('meta', attrs={'property':'og:description'}) or
soup.find('meta', attrs={'property':'og:Description'}))
try:
if description:
desc = description.get('content').strip()