Refine tag and desc parse
This commit is contained in:
parent
67e7e329e6
commit
bf99c98d91
2
buku.1
2
buku.1
@ -43,7 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
|
||||
.IP 5. 4
|
||||
\fBTags\fR:
|
||||
- Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
|
||||
- Page keywords with full stops or having more than 3 words without a comma or having a word to comma ratio > 3 are appended to description rather than tags.
|
||||
- Page keywords having a word to comma ratio > 3 are appended to description rather than tags.
|
||||
- Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
|
||||
- Releases prior to v2.7 supported both uppercase and lowercase letters in tags. From v2.7 onwards, all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is provided to convert the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
|
||||
- Tags can be edited from the prompt very easily using the '>>' (append), '>' (overwrite) and '<<' (remove) operators. The LHS of the operator denotes the indices and ranges of the tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.
|
||||
|
20
buku.py
20
buku.py
@ -2937,10 +2937,7 @@ def is_ignored_mime(url):
|
||||
|
||||
|
||||
def is_unusual_tag(tagstr):
|
||||
"""Identify unusual tags. Criteria:
|
||||
- a full stop found
|
||||
- more than 3 words without any commas
|
||||
- word to comma ratio is greater than 3
|
||||
"""Identify unusual tags with word to comma ratio > 3.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@ -2956,9 +2953,6 @@ def is_unusual_tag(tagstr):
|
||||
if not tagstr:
|
||||
return False
|
||||
|
||||
if tagstr.find('.') != -1:
|
||||
return True
|
||||
|
||||
nwords = len(tagstr.split())
|
||||
ncommas = tagstr.count(',') + 1
|
||||
|
||||
@ -2995,14 +2989,14 @@ def parse_decoded_page(page):
|
||||
except Exception as e:
|
||||
logdbg(e)
|
||||
|
||||
description = (soup.find('meta', attrs={'name':'og:description'}) or
|
||||
soup.find('meta', attrs={'name':'og:Description'}) or
|
||||
soup.find('meta', attrs={'property':'og:description'}) or
|
||||
soup.find('meta', attrs={'property':'og:Description'}) or
|
||||
description = (soup.find('meta', attrs={'name':'description'}) or
|
||||
soup.find('meta', attrs={'name':'Description'}) or
|
||||
soup.find('meta', attrs={'property':'description'}) or
|
||||
soup.find('meta', attrs={'property':'Description'}) or
|
||||
soup.find('meta', attrs={'name':'description'}) or
|
||||
soup.find('meta', attrs={'name':'Description'}))
|
||||
soup.find('meta', attrs={'name':'og:description'}) or
|
||||
soup.find('meta', attrs={'name':'og:Description'}) or
|
||||
soup.find('meta', attrs={'property':'og:description'}) or
|
||||
soup.find('meta', attrs={'property':'og:Description'}))
|
||||
try:
|
||||
if description:
|
||||
desc = description.get('content').strip()
|
||||
|
Loading…
x
Reference in New Issue
Block a user