Fix #300: detect non-tag keywords

2018-08-06 21:13:02 +05:30 · 2018-08-06 21:13:02 +05:30 · 99b82c3055
commit 99b82c3055
parent c0400c72c3
2 changed files with 49 additions and 5 deletions
--- a/buku.1
+++ b/buku.1
@ -43,6 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
 .IP 5. 4
 \fBTags\fR:
  - Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
+  - Page keywords that contain a full stop, have more than 3 words without any comma, or have a word-to-comma ratio greater than 3 are appended to the description instead of being stored as tags.
  - Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
  - Releases prior to v2.7 support both capital and lower cases in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is introduced to modify the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
  - Tags can be edited from the prompt very easily using '>>' (append), '>' (overwrite) and '<<' (remove) symbols. The LHS of the operands denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.
--- a/buku.py
+++ b/buku.py
@ -2931,6 +2931,41 @@ def is_ignored_mime(url):
    return False


+def is_unusual_tag(tagstr):
+    """Identify unusual tags. Criteria:
+       - a full stop found
+       - more than 3 words without any commas
+       - word to comma ratio is greater than 3
+
+    Parameters
+    ----------
+    tagstr : str
+        tag string to check.
+
+    Returns
+    -------
+    bool
+        True if the tag string is unusual (matches any criterion above), else False.
+    """
+
+    if not tagstr:
+        return False
+
+    if tagstr.find('.') != -1:
+        return True
+
+    nwords = len(tagstr.split())
+    ncommas = tagstr.count(',')
+
+    if nwords > 3 and ncommas == 0:
+        return True
+
+    if ncommas and (nwords / ncommas) > 3:
+        return True
+
+    return False
+
+
 def parse_decoded_page(page):
    """Fetch title, description and keywords from decoded html page.

@ -2953,6 +2988,8 @@ def parse_decoded_page(page):

    try:
        title = soup.find('title').text.strip().replace('\n', ' ')
+        if title:
+            title = re.sub('\s{2,}', ' ', title)
    except Exception as e:
        logdbg(e)

@ -2967,6 +3004,8 @@ def parse_decoded_page(page):
    try:
        if description:
            desc = description.get('content').strip()
+            if desc:
+                desc = re.sub('\s{2,}', ' ', desc)
    except Exception as e:
        logdbg(e)

@ -2974,6 +3013,15 @@ def parse_decoded_page(page):
    try:
        if keywords:
            keys = keywords.get('content').strip().replace('\n', ' ')
+            if (is_unusual_tag(keys)):
+                keys = re.sub('\s{2,}', ' ', keys)
+                logdbg('keywords to description: %s', keys)
+                if desc:
+                    desc = desc + '\n\n## ' + keys
+                else:
+                    desc = '* ' + keys
+
+                keys = None
    except Exception as e:
        logdbg(e)

@ -3025,11 +3073,6 @@ def get_data_from_page(resp):
        else:
            title, desc, keywords = parse_decoded_page(resp.data.decode(errors='replace'))

-        if title is not None:
-            title = re.sub('\s{2,}', ' ', title)
-        if desc is not None:
-            desc = re.sub('\s{2,}', ' ', desc)
-
        return (title, desc, keywords)
    except Exception as e:
        logerr(e)