refactor import db (#176)
* chg: dev: refactor bookmark html
* chg: dev: refactor bookmark markdown parser
* chg: dev: update refactor
* new: test: import_md
* fix: dev: newtag parameter on import_md
* new: test: import_html
* new: test: add bs4 as test package
* chg: dev: rename fp to filepath
* new: test: import html and add parent as tag
* new: test: additional test for import_html
* new: test: simple bookmark html
* chg: dev: add round bracket !cosmetic
* chg: dev: maximalize line length !cosmetic
This commit is contained in:
parent
a9f4cd502a
commit
50e89f58b7
149
buku.py
149
buku.py
@ -329,6 +329,91 @@ class BukuCrypt:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def import_md(filepath, newtag):
    '''Parse a bookmark markdown file.

    Recognizes one bookmark per line in the form ``[title](url)``.

    :param filepath: path to the markdown file
    :param newtag: new tag to attach to every record, or None
    :return: generator of (url, title, tags, desc, 0, True) tuples
    '''

    with open(filepath, mode='r', encoding='utf-8') as infp:
        for line in infp:
            # Supported markdown format: [title](url)
            # The '](' combo marks the end of the title and start of the url
            delim_pos = line.find('](')
            if delim_pos == -1:
                continue

            # Opening '[' of the title and closing ')' of the url
            title_open = line[:delim_pos].find('[')
            url_close = line[delim_pos + 2:].rfind(')')
            if title_open == -1 or url_close <= 0:
                continue

            # Slice out the url first so non-generic ones can be skipped
            url = line[delim_pos + 2:delim_pos + 2 + url_close]
            if is_nongeneric_url(url):
                continue

            title = line[title_open + 1:delim_pos]
            yield (
                url, title, delim_wrap(newtag)
                if newtag else None, None, 0, True
            )
|
||||
|
||||
|
||||
def import_html(html_soup, add_parent_folder_as_tag, newtag):
    '''Parse bookmark html.

    :param html_soup: BeautifulSoup object of a bookmark html file
    :param add_parent_folder_as_tag: add parent folder (the nearest
                                     enclosing <h3>/<dl> pair) as tag
    :param newtag: add unique tag
    :return: generator of (url, title, tags, desc, 0, True) tuples
    '''

    # compatibility
    soup = html_soup

    for tag in soup.findAll('a'):
        # Skip anchors without an href attribute and non-generic URLs
        try:
            if is_nongeneric_url(tag['href']):
                continue
        except KeyError:
            # 'e' was bound but unused in the original; the exception
            # simply means the <a> has no href — skip it
            continue

        # Extract comment from the following <dd> tag
        desc = None
        comment_tag = tag.findNextSibling('dd')

        if comment_tag:
            desc = comment_tag.find(text=True, recursive=False)

        # add parent folder as tag
        if add_parent_folder_as_tag:
            # could be its folder or not
            possible_folder = tag.find_previous('h3')
            # get list of tags within that folder
            tag_list = tag.parent.parent.find_parent('dl')

            if (possible_folder and
                    possible_folder.parent in list(tag_list.parents)):
                # then it's the folder of this bookmark
                if tag.has_attr('tags'):
                    tag['tags'] += (DELIM + possible_folder.text)
                else:
                    tag['tags'] = possible_folder.text

        # add unique tag if opted
        if newtag:
            if tag.has_attr('tags'):
                tag['tags'] += (DELIM + newtag)
            else:
                tag['tags'] = newtag

        yield (
            tag['href'], tag.string, parse_tags([tag['tags']])
            if tag.has_attr('tags') else None, desc, 0, True
        )
|
||||
|
||||
|
||||
class BukuDb:
|
||||
'''Abstracts all database operations'''
|
||||
|
||||
@ -1617,27 +1702,8 @@ class BukuDb:
|
||||
newtag = None
|
||||
|
||||
if filepath.endswith('.md'):
|
||||
with open(filepath, mode='r', encoding='utf-8') as infp:
|
||||
for line in infp:
|
||||
# Supported markdown format: [title](url)
|
||||
# Find position of title end, url start delimiter combo
|
||||
index = line.find('](')
|
||||
if index != -1:
|
||||
# Find title start delimiter
|
||||
title_start_delim = line[:index].find('[')
|
||||
# Reverse find the url end delimiter
|
||||
url_end_delim = line[index + 2:].rfind(')')
|
||||
|
||||
if title_start_delim != -1 and url_end_delim > 0:
|
||||
# Parse title
|
||||
title = line[title_start_delim + 1:index]
|
||||
# Parse url
|
||||
url = line[index + 2:index + 2 + url_end_delim]
|
||||
if (is_nongeneric_url(url)):
|
||||
continue
|
||||
|
||||
self.add_rec(url, title, delim_wrap(newtag)
|
||||
if newtag else None, None, 0, True)
|
||||
for item in import_md(filepath=filepath, newtag=newtag):
|
||||
self.add_rec(*item)
|
||||
|
||||
self.conn.commit()
|
||||
infp.close()
|
||||
@ -1658,44 +1724,9 @@ class BukuDb:
|
||||
else:
|
||||
resp = 'y'
|
||||
|
||||
for tag in soup.findAll('a'):
|
||||
# Extract comment from <dd> tag
|
||||
try:
|
||||
if (is_nongeneric_url(tag['href'])):
|
||||
continue
|
||||
except KeyError as e:
|
||||
continue
|
||||
|
||||
desc = None
|
||||
comment_tag = tag.findNextSibling('dd')
|
||||
|
||||
if comment_tag:
|
||||
desc = comment_tag.find(text=True, recursive=False)
|
||||
|
||||
# add parent folder as tag
|
||||
if resp == 'y':
|
||||
# could be its folder or not
|
||||
possible_folder = tag.find_previous('h3')
|
||||
# get list of tags within that folder
|
||||
tag_list = tag.parent.parent.find_parent('dl')
|
||||
|
||||
if ((possible_folder) and
|
||||
possible_folder.parent in list(tag_list.parents)):
|
||||
# then it's the folder of this bookmark
|
||||
if tag.has_attr('tags'):
|
||||
tag['tags'] += (DELIM + possible_folder.text)
|
||||
else:
|
||||
tag['tags'] = possible_folder.text
|
||||
|
||||
# add unique tag if opted
|
||||
if newtag:
|
||||
if tag.has_attr('tags'):
|
||||
tag['tags'] += (DELIM + newtag)
|
||||
else:
|
||||
tag['tags'] = newtag
|
||||
|
||||
self.add_rec(tag['href'], tag.string, parse_tags([tag['tags']])
|
||||
if tag.has_attr('tags') else None, desc, 0, True)
|
||||
add_parent_folder_as_tag = (resp == 'y')
|
||||
for item in import_html(soup, add_parent_folder_as_tag, newtag):
|
||||
self.add_rec(*item)
|
||||
|
||||
self.conn.commit()
|
||||
infp.close()
|
||||
|
4
setup.py
4
setup.py
@ -16,7 +16,9 @@ with open('README.md', encoding='utf-8') as f:
|
||||
long_description = f.read()
|
||||
|
||||
tests_require = [
|
||||
'pytest-cov', 'pytest-catchlog', 'hypothesis>=3.7.0', 'pytest>=3.1.2', 'py>=1.4.33'],
|
||||
'pytest-cov', 'pytest-catchlog', 'hypothesis>=3.7.0', 'pytest>=3.1.2', 'py>=1.4.33',
|
||||
'beautifulsoup4==4.6.0'
|
||||
],
|
||||
|
||||
setup(
|
||||
name='buku',
|
||||
|
@ -482,3 +482,104 @@ def test_is_nongeneric_url(url, exp_res):
|
||||
import buku
|
||||
res = buku.is_nongeneric_url(url)
|
||||
assert res == exp_res
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'newtag, exp_res',
    [
        (None, ('http://example.com', 'text1', None, None, 0, True)),
        ('tag1',('http://example.com', 'text1', ',tag1,', None, 0, True)),
    ]
)
def test_import_md(tmpdir, newtag, exp_res):
    """Parse a one-line markdown bookmark, with and without a new tag."""
    from buku import import_md
    mdfile = tmpdir.mkdir("importmd").join("test.md")
    mdfile.write("[text1](http://example.com)")
    parsed = list(import_md(mdfile.strpath, newtag))
    assert parsed[0] == exp_res
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'html_text, exp_res',
    [
        # single bookmark, single-line <dd> comment, empty trailing <a>
        # NOTE(review): leading whitespace inside these triple-quoted
        # strings and inside the expected desc values is significant to
        # the parser output — confirm exact spacing against the repo file
        (
            """<DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here
<a> </a>""",
            ((
                'https://github.com/j', 'GitHub', ',tag1,tag2,',
                'comment for the bookmark here\n', 0, True
            ),)
        ),
        # comment followed by an <a> holding a second comment line;
        # only the direct (non-recursive) text becomes the desc
        (
            """DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here
<a>second line of the comment here</a>""",
            ((
                'https://github.com/j', 'GitHub', ',tag1,tag2,',
                'comment for the bookmark here\n ', 0, True
            ),)
        ),
        # multi-line comment plus a second bookmark entry
        (
            """DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here
second line of the comment here
third line of the comment here
<DT><A HREF="https://news.com/" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2,tag3">News</A>""",
            (
                (
                    'https://github.com/j', 'GitHub', ',tag1,tag2,',
                    'comment for the bookmark here\n '
                    'second line of the comment here\n '
                    'third line of the comment here\n ',
                    0, True
                ),
                ('https://news.com/', 'News', ',tag1,tag2,tag3,', None, 0, True)
            )
        ),
        # comment with no trailing newline (end of file)
        (

            """DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here""",
            ((
                'https://github.com/j', 'GitHub', ',tag1,tag2,',
                'comment for the bookmark here', 0, True
            ),)
        )

    ]
)
def test_import_html(html_text, exp_res):
    """test method."""
    from buku import import_html
    from bs4 import BeautifulSoup
    html_soup = BeautifulSoup(html_text, 'html.parser')
    # no parent-folder tag, no new tag for these cases
    res = list(import_html(html_soup, False, None))
    # zip truncates to the shorter sequence: extra yielded items
    # (e.g. the empty trailing <a>) are deliberately not checked
    for item, exp_item in zip(res, exp_res):
        assert item == exp_item
|
||||
|
||||
|
||||
def test_import_html_and_add_parent():
    """Bookmark inside a folder gets the <h3> folder name as its tag."""
    from buku import import_html
    from bs4 import BeautifulSoup
    html_text = """<DT><H3>1s</H3>
<DL><p>
<DT><A HREF="http://example.com/"></A>"""
    exp_res = ('http://example.com/', None, ',1s,', None, 0, True)
    soup = BeautifulSoup(html_text, 'html.parser')
    records = list(import_html(soup, True, None))
    assert records[0] == exp_res
|
||||
|
||||
|
||||
def test_import_html_and_new_tag():
    """A unique new tag is appended after the bookmark's existing tags."""
    from buku import import_html
    from bs4 import BeautifulSoup
    html_text = """<DT><A HREF="https://github.com/j" TAGS="tag1,tag2">GitHub</A>
<DD>comment for the bookmark here"""
    exp_res = (
        'https://github.com/j', 'GitHub', ',tag1,tag2,tag3,',
        'comment for the bookmark here', 0, True
    )
    soup = BeautifulSoup(html_text, 'html.parser')
    records = list(import_html(soup, False, 'tag3'))
    assert records[0] == exp_res
|
||||
|
Loading…
x
Reference in New Issue
Block a user