Refine tag and desc parse

2018-08-07 06:35:57 +05:30 · 2018-08-07 06:35:57 +05:30 · bf99c98d91
commit bf99c98d91
parent 67e7e329e6
2 changed files with 8 additions and 14 deletions
--- a/buku.1
+++ b/buku.1
@@ -43,7 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
 .IP 5. 4
 \fBTags\fR:
  - Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
-  - Page keywords with full stops or having more than 3 words without a comma or having a word to comma ratio > 3 are appended to description rather than tags.
+  - Page keywords having a word to comma ratio > 3 are appended to description rather than tags.
  - Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
  - Releases prior to v2.7 support both capital and lower cases in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is introduced to modify the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
  - Tags can be edited from the prompt very easily using '>>' (append), '>' (overwrite) and '<<' (remove) symbols. The LHS of the operands denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.
--- a/buku.py
+++ b/buku.py
@@ -2937,10 +2937,7 @@ def is_ignored_mime(url):


 def is_unusual_tag(tagstr):
-    """Identify unusual tags. Criteria:
-       - a full stop found
-       - more than 3 words without any commas
-       - word to comma ratio is greater than 3
+    """Identify unusual tags with word to comma ratio > 3.

    Parameters
    ----------
@@ -2956,9 +2953,6 @@ def is_unusual_tag(tagstr):
    if not tagstr:
        return False

-    if tagstr.find('.') != -1:
-        return True
-
    nwords = len(tagstr.split())
    ncommas = tagstr.count(',') + 1

@@ -2995,14 +2989,14 @@ def parse_decoded_page(page):
    except Exception as e:
        logdbg(e)

-    description = (soup.find('meta', attrs={'name':'og:description'}) or
-                   soup.find('meta', attrs={'name':'og:Description'}) or
-                   soup.find('meta', attrs={'property':'og:description'}) or
-                   soup.find('meta', attrs={'property':'og:Description'}) or
+    description = (soup.find('meta', attrs={'name':'description'}) or
+                   soup.find('meta', attrs={'name':'Description'}) or
                   soup.find('meta', attrs={'property':'description'}) or
                   soup.find('meta', attrs={'property':'Description'}) or
-                   soup.find('meta', attrs={'name':'description'}) or
-                   soup.find('meta', attrs={'name':'Description'}))
+                   soup.find('meta', attrs={'name':'og:description'}) or
+                   soup.find('meta', attrs={'name':'og:Description'}) or
+                   soup.find('meta', attrs={'property':'og:description'}) or
+                   soup.find('meta', attrs={'property':'og:Description'}))
    try:
        if description:
            desc = description.get('content').strip()