From 16cad432d74fb987e58383d71b42544d764486cb Mon Sep 17 00:00:00 2001
From: Arun Prakash Jana <engineerarun@gmail.com>
Date: Tue, 18 Jul 2017 10:39:58 +0530
Subject: [PATCH] Ignore some non-generic URLs.

URLs starting with `place:`, `file://` and `apt://` are ignored during import.
---
 README.md |  2 ++
 buku.1    |  8 ++++++--
 buku.py   | 17 +++++++++++++++++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4f3752f..fbcb7a1 100644
--- a/README.md
+++ b/README.md
@@ -268,6 +268,8 @@ SYMBOLS:
   - --sreg : match a regular expression (ignores --deep).
   - --stag : search bookmarks by a tag, or list all tags alphabetically with usage count (if no arguments).
   - Search results are indexed serially. This index is different from actual database index of a bookmark record which is shown within `[]` after the title.
+- **Import**:
+  - URLs starting with `place:`, `file://` and `apt://` are ignored during import.
 - **Encryption** is optional and manual. AES256 algorithm is used. To use encryption, the database file should be unlocked (-k) before using `buku` and locked (-l) afterwards. Between these 2 operations, the database file lies unencrypted on the disk, and NOT in memory. Also, note that the database file is *unencrypted on creation*.
 - **Editor** support:
   - A single bookmark can be edited before adding. The editor can be set using the environment variable *EDITOR* or by explicitly specifying the editor. The latter takes preference. If -a is used along with -w, the details are populated in the editor template.
diff --git a/buku.1 b/buku.1
index 771de09..a099fc1 100644
--- a/buku.1
+++ b/buku.1
@@ -72,15 +72,19 @@ Bookmarks with immutable titles are listed with '(L)' after the title.
   - Search results are indexed serially. This index is different from actual database index of a bookmark record which is shown within '[]' after the title.
 .PP
 .IP 9. 4
-\fBEncryption\fR is optional and manual. AES256 algorithm is used. To use encryption, the database file should be unlocked (-k) before using \fBbuku\fR and locked (-l) afterwards. Between these 2 operations, the database file lies unencrypted on the disk, and NOT in memory. Also, note that the database file is \fBunencrypted on creation\fR.
+\fBImport\fR:
+  - URLs starting with 'place:', 'file://' and 'apt://' are ignored during import.
 .PP
 .IP 10. 4
+\fBEncryption\fR is optional and manual. AES256 algorithm is used. To use encryption, the database file should be unlocked (-k) before using \fBbuku\fR and locked (-l) afterwards. Between these 2 operations, the database file lies unencrypted on the disk, and NOT in memory. Also, note that the database file is \fBunencrypted on creation\fR.
+.PP
+.IP 11. 4
 \fBEditor\fR support:
   - A single bookmark can be edited before adding. The editor can be set using the environment variable *EDITOR* or by explicitly specifying the editor. The latter takes precedence. If -a is used along with -w, the details are populated in the editor template.
   - In case of edit and update (a single bookmark), the existing record details are fetched from DB and populated in the editor template. The environment variable EDITOR must be set. Note that -u works independently of -w.
   - All lines beginning with "#" will be stripped. Then line 1 will be treated as the URL, line 2 will be the title, line 3 will be comma separated tags, and the rest of the lines will be parsed as descriptions.
 .PP
-.IP 11. 4
+.IP 12. 4
 \fBProxy\fR support: please refer to the \fBENVIRONMENT\fR section.
 .SH GENERAL OPTIONS
 .TP
diff --git a/buku.py b/buku.py
index 8927439..d1e3d5b 100755
--- a/buku.py
+++ b/buku.py
@@ -1626,6 +1626,8 @@ class BukuDb:
                             title = line[title_start_delim + 1:index]
                             # Parse url
                             url = line[index + 2:index + 2 + url_end_delim]
+                            if (is_nongeneric_url(url)):
+                                continue
 
                             self.add_rec(url, title, None, None, 0, True)
 
@@ -1665,6 +1667,9 @@ class BukuDb:
                 # Extract comment from <dd> tag
                 desc = None
                 comment_tag = tag.findNextSibling('dd')
+                if (is_nongeneric_url(tag['href'])):
+                    continue
+
                 if comment_tag:
                     desc = comment_tag.text[0:comment_tag.text.find('\n')]
 
@@ -1889,6 +1894,18 @@ def is_bad_url(url):
     return False
 
 
+def is_nongeneric_url(url):
+    '''Return True if url starts with a prefix that is skipped during
+    import: 'place:', 'file://' or 'apt://'. URI schemes are
+    case-insensitive (RFC 3986), so the match ignores case.
+    '''
+
+    ignored_prefix = ('place:', 'file://', 'apt://')
+
+    # str.startswith() accepts a tuple and tests each prefix in turn
+    return url.lower().startswith(ignored_prefix)
+
+
 def is_ignored_mime(url):
     '''Check if URL links to ignored mime
     Only a 'HEAD' request is made for these URLs