Merge pull request #338 from ckolumbus/feature/import-firefox-json

Feature/import firefox json
2018-12-28 18:05:38 +05:30 · 2018-12-28 18:05:38 +05:30 · 940d3e1642
commit 940d3e1642
parent 7232ff85a6 be551c60f8
3 changed files with 437 additions and 3 deletions
--- a/141
+++ b/141
@ -22,6 +22,7 @@ from bs4 import BeautifulSoup
 import certifi
 import cgi
 import collections
+from enum import Enum
 import json
 import logging
 import os
@ -2376,6 +2377,24 @@ class BukuDb:
            items = import_md(filepath=filepath, newtag=newtag)
        elif filepath.endswith('org'):
            items = import_org(filepath=filepath, newtag=newtag)
+        elif filepath.endswith('json'):
+            if not tacit:
+                resp = input('Add Bookmark folder name as tag? (y/n): ')
+            else:
+                resp = 'y'
+            add_bookmark_folder_as_tag = (resp == 'y')
+            try:
+                with open(filepath, 'r', encoding='utf-8') as datafile:
+                    data = json.load(datafile)
+
+                items = import_firefox_json(data, add_bookmark_folder_as_tag, newtag)
+
+            except ValueError as e:
+                logerr("JSON Decode Error: {}".format(e))
+                return False
+            except Exception as e:
+                logerr(e)
+                return False

        else:
            try:
@ -2861,6 +2880,128 @@ def import_org(filepath, newtag):
                        if newtag else None, None, 0, True
                    )

+def import_firefox_json(json, add_bookmark_folder_as_tag=False, unique_tag=None):
+    """Open Firefox json export file and import data.
+
+    Ignore 'SmartBookmark'  and 'Separator'  entries.
+
+
+    Needed/used fields out of the JSON schema of the bookmarks:
+
+    title              : the name/title of the entry
+    tags               : ',' separated tags for the bookmark entry
+    typeCode           : 1 - uri, 2 - subfolder, 3 - separator
+    annos/{name,value} : following annotation entries are used
+        name : Places/SmartBookmark            : identifies smart folder, ignored
+        name : bookmarkPropereties/description :  detailed bookmark entry description
+    children           : for subfolders, recurse into the child entries
+
+    Parameters
+    ----------
+    path : str
+        Path to Firefox json bookmarks file.
+    unique_tag : str
+        Timestamp tag in YYYYMonDD format.
+    add_bookmark_folder_as_tag : bool
+        True if bookmark parent folder should be added as tags else False.
+    """
+
+    class TypeCode(Enum):
+        """ Format
+            typeCode
+                1 : uri        (type=text/x-moz-place)
+                2 : subfolder  (type=text/x-moz-container)
+                3 : separator  (type=text/x-moz-separator)
+        """
+        uri = 1
+        folder = 2
+        separator = 3
+
+    def is_smart(entry):
+        result = False
+        try:
+            d = [anno for anno in entry['annos'] if anno['name'] == "Places/SmartBookmark"]
+            result = bool(len(d))
+        except Exception:
+            result = False
+
+        return result
+
+    def extract_desc(entry):
+        try:
+            d = [anno for anno in entry['annos'] if anno['name'] == "bookmarkProperties/description"]
+            return d[0]['value']
+        except Exception:
+            logdbg("no description found for entry: {} {}".format(entry['uri'], entry['title']))
+            return ""
+
+    def extract_tags(entry):
+        tags = []
+        try:
+            tags = entry['tags'].split(',')
+        except Exception:
+            logdbg("no tags found for entry: {} {}".format(entry['uri'], entry['title']))
+
+        return tags
+
+    def iterate_children(parent_folder, entry_list):
+        for bm_entry in entry_list:
+            try:
+                typeCode = bm_entry['typeCode']
+            except Exception:
+                logdbg("item without typeCode found, ignoring: {}".format(bm_entry['title']))
+                continue
+
+            if TypeCode.uri.value == typeCode:
+                try:
+                    if is_smart(bm_entry):
+                        logdbg("SmartBookmark found,m ignoring: {}".format(bm_entry['title']))
+                        continue
+
+                    if is_nongeneric_url(bm_entry['uri']):
+                        logdbg("Non-Generic URL found,m ignoring: {}".format(bm_entry['title']))
+                        continue
+
+                    desc = extract_desc(bm_entry)
+                    bookmark_tags = extract_tags(bm_entry)
+
+                    if add_bookmark_folder_as_tag:
+                        bookmark_tags.append(parent_folder)
+
+                    if unique_tag:
+                        bookmark_tags.append(unique_tag)
+
+                    formatted_tags = [DELIM + tag for tag in bookmark_tags]
+                    tags = parse_tags(formatted_tags)
+
+                    logdbg("Entry found: {}, {}, {}, {} ".format(bm_entry['uri'], bm_entry['title'], tags, desc))
+                    yield (bm_entry['uri'], bm_entry['title'], tags, desc, 0, True)
+
+                except Exception as e:
+                    logerr(e)
+
+            elif TypeCode.folder.value == typeCode:
+                try:
+                    # from python 3.3 on:
+                    # yield from iterate_children(bm_entry['title'], bm_entry['children'])
+
+                    for entry in iterate_children(parent_folder+"/"+bm_entry['title'], bm_entry['children']):
+                        yield entry
+                except Exception as e:
+                    # if any of the properties does not exist, bail out silently
+                    logerr(e)
+
+            elif TypeCode.separator.value == typeCode:
+                logdbg("Unknonw typeCode found : {}".format(typeCode))
+
+    try:
+        entry_list = json['children']
+    except Exception:
+        logerr("No children in Root entry found")
+        return []
+
+    yield from iterate_children("", entry_list)
+

 def import_html(html_soup, add_parent_folder_as_tag, newtag):
    """Parse bookmark html.
--- a/tests/test_buku.py
+++ b/tests/test_buku.py
@ -557,11 +557,10 @@ def test_sigint_handler(capsys):
        [
            'http://www.kadrof.ru/cat_exchange.shtml',
            (
-                'Все биржи фриланса и удаленной работы - больше 110 сайтов | Kadrof.ru',
+                'Все биржи фриланса и удаленной работы - больше 110 сайтов для фрилансеров | Kadrof.ru',
                'Здесь собраны самые популярные биржи удаленной работы и фриланса для новичков и опытных специалистов. '
                'Более 110 ресурсов по видам:',
-                'биржи удаленной работы,биржи фриланс',
-                0, 0
+                'биржи удаленной работы,биржи фриланс', 0, 0
            )
        ],
    ]
--- a/tests/test_import_firefox_json.py
+++ b/tests/test_import_firefox_json.py
@ -0,0 +1,294 @@
+import json
+from buku import import_firefox_json
+
+
+def test_load_from_empty():
+    """test method."""
+    # Arrange
+    data = json.loads("{}")
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    count = sum(1 for _ in items)
+    assert 0 == count
+
+def test_load_full_entry():
+    """test method."""
+
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "children": [
+                {
+                  "dateAdded": 1269200039653000,
+                  "guid": "xxxydfalkj",
+                  "id": 113,
+                  "index": 0,
+                  "lastModified": 1305978154986000,
+                  "title": "title",
+                  "type": "text/x-moz-place",
+                  "typeCode": 1,
+                  "tags" : "x,y",
+                  "uri": "http://uri.com/abc?234&536",
+                  "annos" : [{
+                         "name": "bookmarkProperties/description",
+                         "value": "desc"
+                   }]
+                }]
+        }""")
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 1 == len(result)
+    assert 'http://uri.com/abc?234&536' == result[0][0]
+    assert 'title' == result[0][1]
+    assert ',x,y,' == result[0][2]
+    assert 'desc' == result[0][3]
+
+
+def test_load_no_typecode():
+    """test method."""
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "children": [
+                {
+                    "title" : "title1",
+                    "uri" : "http://uri1",
+                    "annos" : [{
+                         "name": "bookmarkProperties/description",
+                         "value": "desc"
+                     }]
+                }]
+        }""")
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 0 == len(result)
+
+
+def test_load_invalid_typecode():
+    """test method."""
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "children": [
+                {
+                    "title" : "title1",
+                    "typeCode" : 99,
+                    "uri" : "http://uri1",
+                    "annos" : [{
+                         "name": "bookmarkProperties/description",
+                         "value": "desc"
+                     }]
+                }]
+        }""")
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 0 == len(result)
+
+
+def test_load_one_child():
+    """test method."""
+
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "typeCode" : 2,
+            "children": [
+                {
+                    "title" : "title1",
+                    "typeCode" : 1,
+                    "uri" : "http://uri1",
+                    "annos" : [{
+                         "name": "bookmarkProperties/description",
+                         "value": "desc"
+                      }]
+                 }
+            ]
+        } """)
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 1 == len(result)
+    assert 'http://uri1' == result[0][0]
+    assert 'title1' == result[0][1]
+    assert ',' == result[0][2]
+    assert 'desc' == result[0][3]
+
+def test_load_one_container_child():
+    """test method."""
+
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "typeCode" : 2,
+            "children": [
+                {
+                    "title":"bookmark folder",
+                    "typeCode":2
+                } ]
+         }""")
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 0 == len(result)
+
+def test_load_many_children():
+    """test method."""
+
+    # Arrange
+    data = json.loads("""
+            {
+                "title":"Weitere Lesezeichen",
+                "typeCode":2,
+                "children": [
+                    {"title":"title1","typeCode":1,"uri":"http://uri1.com/#more-74"},
+                    {"title":"title2","typeCode":1,"uri":"http://uri2.com/xyz"},
+                    {"title":"title3","typeCode":1,"uri":"http://uri3.com"}
+                ]
+            } """)
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 3 == len(result)
+
+def test_load_hierarchical_container():
+    """test method."""
+
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "typeCode" : 2,
+            "children": [
+                    {
+                        "title" : "title",
+                        "typeCode" : 2,
+                        "children": [
+                            {"title":"title1","typeCode":1,"uri":"http://uri1.com/#more-74"},
+                            {"title":"title2","typeCode":1,"uri":"http://uri2.com/xyz"},
+                            {"title":"title3","typeCode":1,"uri":"http://uri3.com"}
+                        ]
+                    },
+                    {"title":"title4","typeCode":1,"uri":"http://uri4.com/#more-74"},
+                    {"title":"title5","typeCode":1,"uri":"http://uri5.com/xyz"},
+                    {"title":"title6","typeCode":1,"uri":"http://uri6.com"}
+            ]
+        }
+                      """)
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 6 == len(result)
+    assert 'http://uri1.com/#more-74' == result[0][0]
+    assert 'http://uri2.com/xyz' == result[1][0]
+    assert 'http://uri3.com' == result[2][0]
+    assert 'http://uri4.com/#more-74' == result[3][0]
+    assert 'http://uri5.com/xyz' == result[4][0]
+    assert 'http://uri6.com' == result[5][0]
+
+def test_load_separator():
+    """test method."""
+
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "typeCode" : 2,
+            "children": [
+                {
+                  "title": "",
+                  "type": "text/x-moz-place-separator",
+                  "typeCode": 3
+                } ]
+         }""")
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 0 == len(result)
+
+def test_load_multiple_tags():
+    """test method."""
+    # Arrange
+    data = json.loads("""
+        {
+            "title" : "title",
+            "children": [
+                {
+                    "title" : "title1",
+                    "uri" : "http://uri1",
+                    "tags" : "tag1, tag2",
+                    "typeCode": 1,
+                    "annos" : [{
+                         "name": "bookmarkProperties/description",
+                         "value": "desc"
+                     }]
+                }]
+        }""")
+
+    # Act
+    items = import_firefox_json(data)
+
+    # Assert
+    result = []
+    for item in items:
+        result.append(item)
+
+    assert 1 == len(result)
+    assert ",tag1,tag2," == result[0][2]