Fix #300: detect non-tag keywords

2018-08-06 21:13:02 +05:30 · 2018-08-06 21:13:02 +05:30 · 99b82c3055
commit 99b82c3055
parent c0400c72c3
2 changed files with 49 additions and 5 deletions
--- a/buku.1
+++ b/buku.1
@ -43,6 +43,7 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
 .IP 5. 4
 \fBTags\fR:
  - Comma (',') is the tag delimiter in DB. A tag cannot have comma(s) in it. Tags are filtered (for unique tags) and sorted. Tags are stored in lower case and can be replaced, appended or deleted.
  - Page keywords that contain a full stop, have more than 3 words without a comma, or have a word to comma ratio > 3 are appended to the description rather than stored as tags.
  - Parent folder (and subfolder) names are converted to all-lowercase tags during bookmarks html import.
  - Releases prior to v2.7 support both capital and lower cases in tags. From v2.7 all tags are stored in lowercase. An undocumented option --\fIfixtags\fR is introduced to modify the older tags. It also fixes another issue where the same tag appears multiple times in the tagset of a record. Run \fBbuku --fixtags\fR once.
  - Tags can be edited from the prompt very easily using '>>' (append), '>' (overwrite) and '<<' (remove) symbols. The LHS of the operands denotes the indices and ranges of tags to apply (as listed by --tag or key 't' at prompt) and the RHS denotes the actual DB indices and ranges of the bookmarks to apply the change to.
--- a/buku.py
+++ b/buku.py
@ -2931,6 +2931,41 @@ def is_ignored_mime(url):
    return False
 def is_unusual_tag(tagstr):
    """Identify unusual tags.

    A keyword string is treated as unusual (i.e. it does not look like a
    comma-separated list of tags) when any of the following holds:
       - a full stop is found
       - it has more than 3 words without any commas
       - its word to comma ratio is greater than 3

    Parameters
    ----------
    tagstr : str
        tag string to check.

    Returns
    -------
    bool
        True if the tag string is unusual, else False.
        An empty string or None is never reported as unusual.
    """

    if not tagstr:
        return False

    if '.' in tagstr:
        return True

    # Words are whitespace-separated tokens; commas are counted independently.
    nwords = len(tagstr.split())
    ncommas = tagstr.count(',')

    if ncommas == 0:
        # No delimiter at all: only a short phrase is accepted as a single tag.
        return nwords > 3

    # Few commas relative to the number of words suggests a sentence.
    return nwords / ncommas > 3
 def parse_decoded_page(page):
    """Fetch title, description and keywords from decoded html page.
@ -2953,6 +2988,8 @@ def parse_decoded_page(page):
    try:
        title = soup.find('title').text.strip().replace('\n', ' ')
        if title:
            title = re.sub('\s{2,}', ' ', title)
    except Exception as e:
        logdbg(e)
@ -2967,6 +3004,8 @@ def parse_decoded_page(page):
    try:
        if description:
            desc = description.get('content').strip()
            if desc:
                desc = re.sub('\s{2,}', ' ', desc)
    except Exception as e:
        logdbg(e)
@ -2974,6 +3013,15 @@ def parse_decoded_page(page):
    try:
        if keywords:
            keys = keywords.get('content').strip().replace('\n', ' ')
            if (is_unusual_tag(keys)):
                keys = re.sub('\s{2,}', ' ', keys)
                logdbg('keywords to description: %s', keys)
                if desc:
                    desc = desc + '\n\n## ' + keys
                else:
                    desc = '* ' + keys
                keys = None
    except Exception as e:
        logdbg(e)
@ -3025,11 +3073,6 @@ def get_data_from_page(resp):
        else:
            title, desc, keywords = parse_decoded_page(resp.data.decode(errors='replace'))
        if title is not None:
            title = re.sub('\s{2,}', ' ', title)
        if desc is not None:
            desc = re.sub('\s{2,}', ' ', desc)
        return (title, desc, keywords)
    except Exception as e:
        logerr(e)