refactor import db (#176)

* chg: dev: refactor bookmark html
* chg: dev: refactor bookmark markdown parser
* chg: dev: update refactor
* new: test: import_md
* fix: dev: newtag parameter on import_md
* new: test: import_html
* new: test: add bs4 as test package
* chg: dev: rename fp to filepath
* new: test: import html and add parent as tag
* new: test: additional test for import_html
* new: test: simple bookmark html
* chg: dev: add round bracket !cosmetic
* chg: dev: maximalize line length !cosmetic
2017-07-22 11:31:29 +08:00 · 2017-07-22 11:31:29 +08:00 · 50e89f58b7
commit 50e89f58b7
parent a9f4cd502a
3 changed files with 194 additions and 60 deletions
--- a/buku.py
+++ b/buku.py
@ -329,6 +329,91 @@ class BukuCrypt:
            sys.exit(1)


+def import_md(filepath, newtag):
+    '''Generate bookmark records from a markdown file
+
+    Each line is scanned for a link in the form [title](url). Lines
+    without such a link, and links whose url is flagged by
+    is_nongeneric_url(), produce nothing.
+
+    :param filepath: path of the markdown file, read as utf-8
+    :param newtag: tag attached to every record; no tag if falsy
+    :return: a generator of (url, title, tags, None, 0, True) tuples
+    '''
+    with open(filepath, mode='r', encoding='utf-8') as infp:
+        for line in infp:
+            # '](' closes the title and opens the url
+            sep = line.find('](')
+            if sep == -1:
+                continue
+
+            # First '[' before the separator opens the title
+            title_start = line.find('[', 0, sep)
+            # Last ')' on the line closes the url
+            url_start = sep + 2
+            url_end = line.rfind(')', url_start)
+
+            # Need an opening '[' and a non-empty url
+            if title_start == -1 or url_end <= url_start:
+                continue
+
+            title = line[title_start + 1:sep]
+            url = line[url_start:url_end]
+            if is_nongeneric_url(url):
+                continue
+
+            tags = delim_wrap(newtag) if newtag else None
+            yield (url, title, tags, None, 0, True)
+
+
+def import_html(html_soup, add_parent_folder_as_tag, newtag):
+    '''Generate bookmark records from parsed bookmark html
+
+    Every <a> element that has an href not flagged by is_nongeneric_url()
+    yields one tuple: (url, title, tags, desc, 0, True).
+
+    Note: the 'tags' attribute of the visited <a> elements is updated in
+    place on html_soup whenever a folder name or newtag is appended.
+
+    :param html_soup: HTML soup of bookmark html
+    :param add_parent_folder_as_tag: add parent folder as tag
+    :param newtag: add unique tag
+    :return: a generator of tuples containing parsed result
+    '''
+    for tag in html_soup.findAll('a'):
+        # Anchors without href are not bookmarks
+        try:
+            href = tag['href']
+        except KeyError:
+            continue
+
+        if is_nongeneric_url(href):
+            continue
+
+        # Extract comment from <dd> tag: first direct text node only
+        desc = None
+        comment_tag = tag.findNextSibling('dd')
+
+        if comment_tag:
+            desc = comment_tag.find(text=True, recursive=False)
+
+        # add parent folder as tag
+        if add_parent_folder_as_tag:
+            # could be its folder or not
+            possible_folder = tag.find_previous('h3')
+            # <dl> enclosing this bookmark; stays None when the anchor is
+            # too close to the document root or no such <dl> exists, in
+            # which case the bookmark is treated as not being in a folder
+            grandparent = tag.parent.parent
+            tag_list = None
+            if grandparent is not None:
+                tag_list = grandparent.find_parent('dl')
+
+            if (possible_folder and tag_list is not None and
+                    possible_folder.parent in list(tag_list.parents)):
+                # then it's the folder of this bookmark
+                if tag.has_attr('tags'):
+                    tag['tags'] += (DELIM + possible_folder.text)
+                else:
+                    tag['tags'] = possible_folder.text
+
+        # add unique tag if opted
+        if newtag:
+            if tag.has_attr('tags'):
+                tag['tags'] += (DELIM + newtag)
+            else:
+                tag['tags'] = newtag
+
+        tags = parse_tags([tag['tags']]) if tag.has_attr('tags') else None
+        yield (href, tag.string, tags, desc, 0, True)
+
+
 class BukuDb:
    '''Abstracts all database operations'''

@ -1617,27 +1702,8 @@ class BukuDb:
            newtag = None

        if filepath.endswith('.md'):
-            with open(filepath, mode='r', encoding='utf-8') as infp:
-                for line in infp:
-                    # Supported markdown format: [title](url)
-                    # Find position of title end, url start delimiter combo
-                    index = line.find('](')
-                    if index != -1:
-                        # Find title start delimiter
-                        title_start_delim = line[:index].find('[')
-                        # Reverse find the url end delimiter
-                        url_end_delim = line[index + 2:].rfind(')')
-
-                        if title_start_delim != -1 and url_end_delim > 0:
-                            # Parse title
-                            title = line[title_start_delim + 1:index]
-                            # Parse url
-                            url = line[index + 2:index + 2 + url_end_delim]
-                            if (is_nongeneric_url(url)):
-                                continue
-
-                            self.add_rec(url, title, delim_wrap(newtag)
-                                         if newtag else None, None, 0, True)
+            for item in import_md(filepath=filepath, newtag=newtag):
+                self.add_rec(*item)

            self.conn.commit()
            infp.close()
@ -1658,44 +1724,9 @@ class BukuDb:
            else:
                resp = 'y'

-            for tag in soup.findAll('a'):
-                # Extract comment from <dd> tag
-                try:
-                    if (is_nongeneric_url(tag['href'])):
-                        continue
-                except KeyError as e:
-                    continue
-
-                desc = None
-                comment_tag = tag.findNextSibling('dd')
-
-                if comment_tag:
-                    desc = comment_tag.find(text=True, recursive=False)
-
-                # add parent folder as tag
-                if resp == 'y':
-                    # could be its folder or not
-                    possible_folder = tag.find_previous('h3')
-                    # get list of tags within that folder
-                    tag_list = tag.parent.parent.find_parent('dl')
-
-                    if ((possible_folder) and
-                            possible_folder.parent in list(tag_list.parents)):
-                        # then it's the folder of this bookmark
-                        if tag.has_attr('tags'):
-                            tag['tags'] += (DELIM + possible_folder.text)
-                        else:
-                            tag['tags'] = possible_folder.text
-
-                # add unique tag if opted
-                if newtag:
-                    if tag.has_attr('tags'):
-                        tag['tags'] += (DELIM + newtag)
-                    else:
-                        tag['tags'] = newtag
-
-                self.add_rec(tag['href'], tag.string, parse_tags([tag['tags']])
-                             if tag.has_attr('tags') else None, desc, 0, True)
+            add_parent_folder_as_tag = (resp == 'y')
+            for item in import_html(soup, add_parent_folder_as_tag, newtag):
+                self.add_rec(*item)

            self.conn.commit()
            infp.close()
--- a/setup.py
+++ b/setup.py
@ -16,7 +16,9 @@ with open('README.md', encoding='utf-8') as f:
    long_description = f.read()

 tests_require = [
-    'pytest-cov', 'pytest-catchlog', 'hypothesis>=3.7.0', 'pytest>=3.1.2', 'py>=1.4.33'],
+    'pytest-cov', 'pytest-catchlog', 'hypothesis>=3.7.0', 'pytest>=3.1.2', 'py>=1.4.33',
+    'beautifulsoup4==4.6.0'
+],

 setup(
    name='buku',
--- a/tests/test_buku.py
+++ b/tests/test_buku.py
@ -482,3 +482,104 @@ def test_is_nongeneric_url(url, exp_res):
    import buku
    res = buku.is_nongeneric_url(url)
    assert res == exp_res
+
+
+@pytest.mark.parametrize(
+    'newtag, exp_res',
+    [
+        (None, ('http://example.com', 'text1', None, None, 0, True)),
+        ('tag1', ('http://example.com', 'text1', ',tag1,', None, 0, True)),
+    ]
+)
+def test_import_md(tmpdir, newtag, exp_res):
+    """Parse a one-link markdown file and check the yielded record."""
+    from buku import import_md
+    p = tmpdir.mkdir("importmd").join("test.md")
+    p.write("[text1](http://example.com)")
+    res = list(import_md(p.strpath, newtag))
+    # The file holds exactly one link, so compare the whole result list:
+    # checking only res[0] would let spurious extra records go unnoticed.
+    assert res == [exp_res]
+
+
+@pytest.mark.parametrize(
+    'html_text, exp_res',
+    [
+        (
+            """<DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
+<DD>comment for the bookmark here
+<a> </a>""",
+            ((
+                'https://github.com/j', 'GitHub', ',tag1,tag2,',
+                'comment for the bookmark here\n', 0, True
+            ),)
+        ),
+        (
+            """DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
+            <DD>comment for the bookmark here
+            <a>second line of the comment here</a>""",
+            ((
+                'https://github.com/j', 'GitHub', ',tag1,tag2,',
+                'comment for the bookmark here\n            ', 0, True
+            ),)
+        ),
+        (
+            """DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
+            <DD>comment for the bookmark here
+            second line of the comment here
+            third line of the comment here
+            <DT><A HREF="https://news.com/" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2,tag3">News</A>""",
+            (
+                (
+                    'https://github.com/j', 'GitHub', ',tag1,tag2,',
+                    'comment for the bookmark here\n            '
+                    'second line of the comment here\n            '
+                    'third line of the comment here\n            ',
+                    0, True
+                ),
+                ('https://news.com/', 'News', ',tag1,tag2,tag3,', None, 0, True)
+            )
+        ),
+        (
+
+            """DT><A HREF="https://github.com/j" ADD_DATE="1360951967" PRIVATE="1" TAGS="tag1,tag2">GitHub</A>
+            <DD>comment for the bookmark here""",
+            ((
+                'https://github.com/j', 'GitHub', ',tag1,tag2,',
+                'comment for the bookmark here', 0, True
+            ),)
+        )
+
+    ]
+)
+def test_import_html(html_text, exp_res):
+    """Parse bookmark html snippets and compare every yielded record."""
+    from buku import import_html
+    from bs4 import BeautifulSoup
+    html_soup = BeautifulSoup(html_text, 'html.parser')
+    res = list(import_html(html_soup, False, None))
+    # zip() stops at the shorter sequence, so without this check a parser
+    # yielding too few (or no) records would pass silently.
+    assert len(res) == len(exp_res)
+    for item, exp_item in zip(res, exp_res):
+        assert item == exp_item
+
+
+def test_import_html_and_add_parent():
+    """Bookmark below an <H3> folder heading gets the folder name as tag."""
+    from buku import import_html
+    from bs4 import BeautifulSoup
+    markup = """<DT><H3>1s</H3>
+<DL><p>
+<DT><A HREF="http://example.com/"></A>"""
+    expected = ('http://example.com/', None, ',1s,', None, 0, True)
+    records = list(
+        import_html(BeautifulSoup(markup, 'html.parser'), True, None)
+    )
+    assert records[0] == expected
+
+
+def test_import_html_and_new_tag():
+    """The newtag argument is appended to the bookmark's existing tags."""
+    from buku import import_html
+    from bs4 import BeautifulSoup
+    markup = """<DT><A HREF="https://github.com/j" TAGS="tag1,tag2">GitHub</A>
+<DD>comment for the bookmark here"""
+    expected = (
+        'https://github.com/j', 'GitHub', ',tag1,tag2,tag3,',
+        'comment for the bookmark here', 0, True
+    )
+    records = list(
+        import_html(BeautifulSoup(markup, 'html.parser'), False, 'tag3')
+    )
+    assert records[0] == expected