Merge pull request #338 from ckolumbus/feature/import-firefox-json

Feature/import firefox json
This commit is contained in:
Arun Prakash Jana 2018-12-28 18:05:38 +05:30 committed by GitHub
commit 940d3e1642
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 437 additions and 3 deletions

141
buku
View File

@ -22,6 +22,7 @@ from bs4 import BeautifulSoup
import certifi
import cgi
import collections
from enum import Enum
import json
import logging
import os
@ -2376,6 +2377,24 @@ class BukuDb:
items = import_md(filepath=filepath, newtag=newtag)
elif filepath.endswith('org'):
items = import_org(filepath=filepath, newtag=newtag)
elif filepath.endswith('json'):
if not tacit:
resp = input('Add Bookmark folder name as tag? (y/n): ')
else:
resp = 'y'
add_bookmark_folder_as_tag = (resp == 'y')
try:
with open(filepath, 'r', encoding='utf-8') as datafile:
data = json.load(datafile)
items = import_firefox_json(data, add_bookmark_folder_as_tag, newtag)
except ValueError as e:
logerr("JSON Decode Error: {}".format(e))
return False
except Exception as e:
logerr(e)
return False
else:
try:
@ -2861,6 +2880,128 @@ def import_org(filepath, newtag):
if newtag else None, None, 0, True
)
def import_firefox_json(json, add_bookmark_folder_as_tag=False, unique_tag=None):
    """Yield bookmark records from a decoded Firefox JSON bookmarks export.

    'SmartBookmark' entries, separators and non-generic URLs are skipped.
    Entries that cannot be processed are logged and skipped; they never
    abort the import.

    Fields used from the JSON schema of the bookmarks:

        title              : the name/title of the entry
        uri                : the bookmark URL
        tags               : ',' separated tags for the bookmark entry
        typeCode           : 1 - uri, 2 - subfolder, 3 - separator
        annos/{name,value} : following annotation entries are used
            name : Places/SmartBookmark            : identifies smart bookmark, ignored
            name : bookmarkProperties/description  : detailed bookmark entry description
        children           : for subfolders, recurse into the child entries

    Parameters
    ----------
    json : dict
        Decoded content of the Firefox json bookmarks file (root entry).
        NOTE: the name shadows the json module inside this function; it is
        kept because callers may pass it by keyword.
    add_bookmark_folder_as_tag : bool, optional
        True if bookmark parent folder should be added as tag else False.
        The tag is the '/'-joined chain of folder titles below the root,
        with a leading '/'. Default is False.
    unique_tag : str, optional
        Timestamp tag in YYYYMonDD format. Default is None.

    Yields
    ------
    tuple
        (uri, title, tags, description, 0, True) for each imported bookmark,
        where tags is the value returned by parse_tags().
    """

    class TypeCode(Enum):
        """Values of the 'typeCode' field.

        1 : uri       (type=text/x-moz-place)
        2 : subfolder (type=text/x-moz-container)
        3 : separator (type=text/x-moz-separator)
        """
        uri = 1
        folder = 2
        separator = 3

    def is_smart(entry):
        """Return True if the entry carries a 'Places/SmartBookmark' annotation."""
        try:
            return any(anno['name'] == "Places/SmartBookmark" for anno in entry['annos'])
        except (KeyError, TypeError):
            # no (usable) annotations: not a smart bookmark
            return False

    def extract_desc(entry):
        """Return the description annotation of the entry, or '' if there is none."""
        try:
            return next(
                anno['value'] for anno in entry['annos']
                if anno['name'] == "bookmarkProperties/description"
            )
        except (KeyError, TypeError, StopIteration):
            # .get() keeps the log call itself from raising on sparse entries
            logdbg("no description found for entry: {} {}".format(entry.get('uri'), entry.get('title')))
            return ""

    def extract_tags(entry):
        """Return the list of raw tags of the entry, or [] if it has none."""
        try:
            return entry['tags'].split(',')
        except (KeyError, AttributeError):
            logdbg("no tags found for entry: {} {}".format(entry.get('uri'), entry.get('title')))
            return []

    def build_record(parent_folder, entry):
        """Return the record tuple for a uri entry, or None if it is to be ignored.

        Raises KeyError if the entry lacks 'uri' or 'title'.
        """
        if is_smart(entry):
            logdbg("SmartBookmark found,m ignoring: {}".format(entry.get('title')))
            return None

        uri = entry['uri']
        if is_nongeneric_url(uri):
            logdbg("Non-Generic URL found,m ignoring: {}".format(entry.get('title')))
            return None

        title = entry['title']
        desc = extract_desc(entry)
        bookmark_tags = extract_tags(entry)
        if add_bookmark_folder_as_tag:
            bookmark_tags.append(parent_folder)
        if unique_tag:
            bookmark_tags.append(unique_tag)

        tags = parse_tags([DELIM + tag for tag in bookmark_tags])
        logdbg("Entry found: {}, {}, {}, {} ".format(uri, title, tags, desc))
        return (uri, title, tags, desc, 0, True)

    def iterate_children(parent_folder, entry_list):
        """Walk entry_list recursively and yield a record for each bookmark."""
        for bm_entry in entry_list:
            if not isinstance(bm_entry, dict):
                logdbg("item without typeCode found, ignoring: {}".format(bm_entry))
                continue

            type_code = bm_entry.get('typeCode')
            if type_code is None:
                logdbg("item without typeCode found, ignoring: {}".format(bm_entry.get('title')))
                continue

            if type_code == TypeCode.uri.value:
                # Only record construction is guarded; the yield stays outside
                # so that errors raised by the consumer are not swallowed here.
                try:
                    record = build_record(parent_folder, bm_entry)
                except Exception as e:
                    logerr(e)
                    continue
                if record is not None:
                    yield record
            elif type_code == TypeCode.folder.value:
                children = bm_entry.get('children')
                if not isinstance(children, list):
                    # a folder without a usable 'children' list has nothing to import
                    continue
                folder_path = "{}/{}".format(parent_folder, bm_entry.get('title') or '')
                yield from iterate_children(folder_path, children)
            elif type_code == TypeCode.separator.value:
                # separators carry no bookmark data
                continue
            else:
                logdbg("Unknonw typeCode found : {}".format(type_code))

    entry_list = json.get('children') if isinstance(json, dict) else None
    if not isinstance(entry_list, list):
        logerr("No children in Root entry found")
        return

    yield from iterate_children("", entry_list)
def import_html(html_soup, add_parent_folder_as_tag, newtag):
"""Parse bookmark html.

View File

@ -557,11 +557,10 @@ def test_sigint_handler(capsys):
[
'http://www.kadrof.ru/cat_exchange.shtml',
(
'Все биржи фриланса и удаленной работы - больше 110 сайтов | Kadrof.ru',
'Все биржи фриланса и удаленной работы - больше 110 сайтов для фрилансеров | Kadrof.ru',
'Здесь собраны самые популярные биржи удаленной работы и фриланса для новичков и опытных специалистов. '
'Более 110 ресурсов по видам:',
'биржи удаленной работы,биржи фриланс',
0, 0
'биржи удаленной работы,биржи фриланс', 0, 0
)
],
]

View File

@ -0,0 +1,294 @@
import json
from buku import import_firefox_json
def test_load_from_empty():
    """An empty JSON object produces no bookmark items."""
    # Arrange
    empty_export = json.loads("{}")

    # Act
    bookmarks = list(import_firefox_json(empty_export))

    # Assert
    assert len(bookmarks) == 0
def test_load_full_entry():
    """A fully populated uri entry yields its uri, title, tags and description."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"children": [
{
"dateAdded": 1269200039653000,
"guid": "xxxydfalkj",
"id": 113,
"index": 0,
"lastModified": 1305978154986000,
"title": "title",
"type": "text/x-moz-place",
"typeCode": 1,
"tags" : "x,y",
"uri": "http://uri.com/abc?234&536",
"annos" : [{
"name": "bookmarkProperties/description",
"value": "desc"
}]
}]
}""")

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 1
    url, title, tags, desc = bookmarks[0][:4]
    assert url == 'http://uri.com/abc?234&536'
    assert title == 'title'
    assert tags == ',x,y,'
    assert desc == 'desc'
def test_load_no_typecode():
    """An entry that has no 'typeCode' field is not imported."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"children": [
{
"title" : "title1",
"uri" : "http://uri1",
"annos" : [{
"name": "bookmarkProperties/description",
"value": "desc"
}]
}]
}""")

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 0
def test_load_invalid_typecode():
    """An entry whose 'typeCode' is 99 is not imported."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"children": [
{
"title" : "title1",
"typeCode" : 99,
"uri" : "http://uri1",
"annos" : [{
"name": "bookmarkProperties/description",
"value": "desc"
}]
}]
}""")

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 0
def test_load_one_child():
    """A single untagged uri child is imported with empty tags and its description."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"typeCode" : 2,
"children": [
{
"title" : "title1",
"typeCode" : 1,
"uri" : "http://uri1",
"annos" : [{
"name": "bookmarkProperties/description",
"value": "desc"
}]
}
]
} """)

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 1
    url, title, tags, desc = bookmarks[0][:4]
    assert url == 'http://uri1'
    assert title == 'title1'
    assert tags == ','
    assert desc == 'desc'
def test_load_one_container_child():
    """A folder child that has no 'children' of its own yields nothing."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"typeCode" : 2,
"children": [
{
"title":"bookmark folder",
"typeCode":2
} ]
}""")

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 0
def test_load_many_children():
    """Three uri children below the root yield three items."""
    # Arrange
    export = json.loads("""
{
"title":"Weitere Lesezeichen",
"typeCode":2,
"children": [
{"title":"title1","typeCode":1,"uri":"http://uri1.com/#more-74"},
{"title":"title2","typeCode":1,"uri":"http://uri2.com/xyz"},
{"title":"title3","typeCode":1,"uri":"http://uri3.com"}
]
} """)

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 3
def test_load_hierarchical_container():
    """Bookmarks of a nested folder are yielded in document order with their siblings."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"typeCode" : 2,
"children": [
{
"title" : "title",
"typeCode" : 2,
"children": [
{"title":"title1","typeCode":1,"uri":"http://uri1.com/#more-74"},
{"title":"title2","typeCode":1,"uri":"http://uri2.com/xyz"},
{"title":"title3","typeCode":1,"uri":"http://uri3.com"}
]
},
{"title":"title4","typeCode":1,"uri":"http://uri4.com/#more-74"},
{"title":"title5","typeCode":1,"uri":"http://uri5.com/xyz"},
{"title":"title6","typeCode":1,"uri":"http://uri6.com"}
]
}
""")

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 6
    expected_urls = [
        'http://uri1.com/#more-74',
        'http://uri2.com/xyz',
        'http://uri3.com',
        'http://uri4.com/#more-74',
        'http://uri5.com/xyz',
        'http://uri6.com',
    ]
    assert [bookmark[0] for bookmark in bookmarks] == expected_urls
def test_load_separator():
    """A separator entry (typeCode 3) yields nothing."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"typeCode" : 2,
"children": [
{
"title": "",
"type": "text/x-moz-place-separator",
"typeCode": 3
} ]
}""")

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 0
def test_load_multiple_tags():
    """Comma separated tags with surrounding spaces come back normalized."""
    # Arrange
    export = json.loads("""
{
"title" : "title",
"children": [
{
"title" : "title1",
"uri" : "http://uri1",
"tags" : "tag1, tag2",
"typeCode": 1,
"annos" : [{
"name": "bookmarkProperties/description",
"value": "desc"
}]
}]
}""")

    # Act
    bookmarks = list(import_firefox_json(export))

    # Assert
    assert len(bookmarks) == 1
    assert bookmarks[0][2] == ",tag1,tag2,"