refactor import db (#176)

* chg: dev: refactor bookmark html

* chg: dev: refactor bookmark markdown parser

* chg: dev: update refactor

* new: test: import_md

* fix: dev: newtag parameter on import_md

* new: test: import_html

* new: test:  add bs4 as test package

* chg: dev: rename fp to filepath

* new: test: import html and add parent as tag

* new: test: additional test for import_html

* new: test: simple bookmark html

* chg: dev: add round bracket !cosmetic

* chg: dev: maximalize line length !cosmetic
This commit is contained in:
rachmadani haryono 2017-07-22 11:31:29 +08:00 committed by Arun Prakash Jana
parent a9f4cd502a
commit 50e89f58b7
3 changed files with 194 additions and 60 deletions

149
buku.py
View File

@ -329,6 +329,91 @@ class BukuCrypt:
sys.exit(1)
def import_md(filepath, newtag):
'''Parse bookmark markdown file
:param filepath: Markdown file
:param newtag: New tag
:return: a tuple containing parsed result
'''
with open(filepath, mode='r', encoding='utf-8') as infp:
for line in infp:
# Supported markdown format: [title](url)
# Find position of title end, url start delimiter combo
index = line.find('](')
if index != -1:
# Find title start delimiter
title_start_delim = line[:index].find('[')
# Reverse find the url end delimiter
url_end_delim = line[index + 2:].rfind(')')
if title_start_delim != -1 and url_end_delim > 0:
# Parse title
title = line[title_start_delim + 1:index]
# Parse url
url = line[index + 2:index + 2 + url_end_delim]
if (is_nongeneric_url(url)):
continue
yield (
url, title, delim_wrap(newtag)
if newtag else None, None, 0, True
)
def import_html(html_soup, add_parent_folder_as_tag, newtag):
'''Parse bookmark html
:param html_soup: HTML soup of bookmark html
:param add_parent_folder_as_tag: add parent folder as tag
:param newtag: add unique tag
:return: a tuple containing parsed result
'''
# compatibility
soup = html_soup
for tag in soup.findAll('a'):
# Extract comment from <dd> tag
try:
if (is_nongeneric_url(tag['href'])):
continue
except KeyError as e:
continue
desc = None
comment_tag = tag.findNextSibling('dd')
if comment_tag:
desc = comment_tag.find(text=True, recursive=False)
# add parent folder as tag
if add_parent_folder_as_tag:
# could be its folder or not
possible_folder = tag.find_previous('h3')
# get list of tags within that folder
tag_list = tag.parent.parent.find_parent('dl')
if ((possible_folder) and
possible_folder.parent in list(tag_list.parents)):
# then it's the folder of this bookmark
if tag.has_attr('tags'):
tag['tags'] += (DELIM + possible_folder.text)
else:
tag['tags'] = possible_folder.text
# add unique tag if opted
if newtag:
if tag.has_attr('tags'):
tag['tags'] += (DELIM + newtag)
else:
tag['tags'] = newtag
yield (
tag['href'], tag.string, parse_tags([tag['tags']])
if tag.has_attr('tags') else None, desc, 0, True
)
class BukuDb:
'''Abstracts all database operations'''
@ -1617,27 +1702,8 @@ class BukuDb:
newtag = None
if filepath.endswith('.md'):
with open(filepath, mode='r', encoding='utf-8') as infp:
for line in infp:
# Supported markdown format: [title](url)
# Find position of title end, url start delimiter combo
index = line.find('](')
if index != -1:
# Find title start delimiter
title_start_delim = line[:index].find('[')
# Reverse find the url end delimiter
url_end_delim = line[index + 2:].rfind(')')
if title_start_delim != -1 and url_end_delim > 0:
# Parse title
title = line[title_start_delim + 1:index]
# Parse url
url = line[index + 2:index + 2 + url_end_delim]
if (is_nongeneric_url(url)):
continue
self.add_rec(url, title, delim_wrap(newtag)
if newtag else None, None, 0, True)
for item in import_md(filepath=filepath, newtag=newtag):
self.add_rec(*item)
self.conn.commit()
infp.close()
@ -1658,44 +1724,9 @@ class BukuDb:
else:
resp = 'y'
for tag in soup.findAll('a'):
# Extract comment from <dd> tag
try:
if (is_nongeneric_url(tag['href'])):
continue
except KeyError as e:
continue
desc = None
comment_tag = tag.findNextSibling('dd')
if comment_tag:
desc = comment_tag.find(text=True, recursive=False)
# add parent folder as tag
if resp == 'y':
# could be its folder or not
possible_folder = tag.find_previous('h3')
# get list of tags within that folder
tag_list = tag.parent.parent.find_parent('dl')
if ((possible_folder) and
possible_folder.parent in list(tag_list.parents)):
# then it's the folder of this bookmark
if tag.has_attr('tags'):
tag['tags'] += (DELIM + possible_folder.text)
else:
tag['tags'] = possible_folder.text
# add unique tag if opted
if newtag:
if tag.has_attr('tags'):
tag['tags'] += (DELIM + newtag)
else:
tag['tags'] = newtag
self.add_rec(tag['href'], tag.string, parse_tags([tag['tags']])
if tag.has_attr('tags') else None, desc, 0, True)
add_parent_folder_as_tag = (resp == 'y')
for item in import_html(soup, add_parent_folder_as_tag, newtag):
self.add_rec(*item)
self.conn.commit()
infp.close()

View File

@ -16,7 +16,9 @@ with open('README.md', encoding='utf-8') as f:
long_description = f.read()
tests_require = [
'pytest-cov', 'pytest-catchlog', 'hypothesis>=3.7.0', 'pytest>=3.1.2', 'py>=1.4.33'],
'pytest-cov', 'pytest-catchlog', 'hypothesis>=3.7.0', 'pytest>=3.1.2', 'py>=1.4.33',
'beautifulsoup4==4.6.0'
],
setup(
name='buku',

View File

@ -482,3 +482,104 @@ def test_is_nongeneric_url(url, exp_res):
import buku
res = buku.is_nongeneric_url(url)
assert res == exp_res
@pytest.mark.parametrize(
'newtag, exp_res',
[
(None, ('http://example.com', 'text1', None, None, 0, True)),
('tag1',('http://example.com', 'text1', ',tag1,', None, 0, True)),
]
)
def test_import_md(tmpdir, newtag, exp_res):
from buku import import_md
p = tmpdir.mkdir("importmd").join("test.md")
p.write("[text1](http://example.com)")
res = list(import_md(p.strpath, newtag))
assert res[0] == exp_res
@pytest.mark.parametrize(
'html_text, exp_res',
[
(
"""<DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here
<a> </a>""",
((
'https://github.com/j', 'GitHub', ',tag1,tag2,',
'comment for the bookmark here\n', 0, True
),)
),
(
"""DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here
<a>second line of the comment here</a>""",
((
'https://github.com/j', 'GitHub', ',tag1,tag2,',
'comment for the bookmark here\n ', 0, True
),)
),
(
"""DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here
second line of the comment here
third line of the comment here
<DT><A HREF="https://news.com/" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2,tag3">News</A>""",
(
(
'https://github.com/j', 'GitHub', ',tag1,tag2,',
'comment for the bookmark here\n '
'second line of the comment here\n '
'third line of the comment here\n ',
0, True
),
('https://news.com/', 'News', ',tag1,tag2,tag3,', None, 0, True)
)
),
(
"""DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here""",
((
'https://github.com/j', 'GitHub', ',tag1,tag2,',
'comment for the bookmark here', 0, True
),)
)
]
)
def test_import_html(html_text, exp_res):
"""test method."""
from buku import import_html
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(html_text, 'html.parser')
res = list(import_html(html_soup, False, None))
for item, exp_item in zip(res, exp_res):
assert item == exp_item
def test_import_html_and_add_parent():
from buku import import_html
from bs4 import BeautifulSoup
html_text = """<DT><H3>1s</H3>
<DL><p>
<DT><A HREF="http://example.com/"></A>"""
exp_res = ('http://example.com/', None, ',1s,', None, 0, True)
html_soup = BeautifulSoup(html_text, 'html.parser')
res = list(import_html(html_soup, True, None))
assert res[0] == exp_res
def test_import_html_and_new_tag():
from buku import import_html
from bs4 import BeautifulSoup
html_text = """<DT><A HREF="https://github.com/j" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here"""
exp_res = (
'https://github.com/j', 'GitHub', ',tag1,tag2,tag3,',
'comment for the bookmark here', 0, True
)
html_soup = BeautifulSoup(html_text, 'html.parser')
res = list(import_html(html_soup, False, 'tag3'))
assert res[0] == exp_res