From c6f763afd66b71b36c7023b9fcb4f0a4fbb48030 Mon Sep 17 00:00:00 2001
From: Koubae <58447627+Koubae@users.noreply.github.com>
Date: Fri, 11 Dec 2020 07:11:52 +0100
Subject: [PATCH] Added urllib-downloader koubae

Signed-off-by: Koubae <58447627+Koubae@users.noreply.github.com>
---
 .gitignore                             |   4 +-
 urllib_downloader/Example/multiproc.py |  45 +++++
 urllib_downloader/Example/simple.py    |  18 ++
 urllib_downloader/README.MD            |  62 ++++++
 urllib_downloader/Tests/test.py        |  66 +++++++
 urllib_downloader/__init__.py          |   0
 urllib_downloader/download_urllib.py   | 243 +++++++++++++++++++++++
 urllib_downloader/requirements.txt     |   1 +
 8 files changed, 438 insertions(+), 1 deletion(-)
 create mode 100644 urllib_downloader/Example/multiproc.py
 create mode 100644 urllib_downloader/Example/simple.py
 create mode 100644 urllib_downloader/README.MD
 create mode 100644 urllib_downloader/Tests/test.py
 create mode 100644 urllib_downloader/__init__.py
 create mode 100644 urllib_downloader/download_urllib.py
 create mode 100644 urllib_downloader/requirements.txt

diff --git a/.gitignore b/.gitignore
index 5a292ce..4518c14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.log
 *.json
 data/
-logs/
\ No newline at end of file
+logs/
+__pycache__
+.idea
\ No newline at end of file
diff --git a/urllib_downloader/Example/multiproc.py b/urllib_downloader/Example/multiproc.py
new file mode 100644
index 0000000..024a735
--- /dev/null
+++ b/urllib_downloader/Example/multiproc.py
@@ -0,0 +1,45 @@
+import os, sys, inspect
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+sys.path.insert(0, parentdir)
+from download_urllib import download_manager
+
+
+main_keywords = ['neutral', 'angry', 'surprise', 'disgust', 'fear', 'happy', 'sad']
+
+supplemented_keywords = ['facial expression',
+                         'human face',
+                         'face',
+                         'old face',
+                         'young face',
+                         'adult face',
+                         'child face',
+                         'woman face',
+                         'man face',
+                         'male face',
+                         'female face',
+                         'gentleman face',
+                         'lady face',
+                         'boy face',
+                         'girl face',
+                         'American face',
+                         'Chinese face',
+                         'Korean face',
+                         'Japanese face',
+                         'actor face',
+                         'actress face',
+                         'doctor face',
+                         'movie face'
+                         ]
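+
+# NOTE: download_manager pairs every main keyword with every supplemented
+# keyword, so the second call below issues 7 * 23 = 161 search queries.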
+
+if __name__ == '__main__':
+    # Multiprocess
+    keywords = ['Car', 'Motorbike']
+    extra_words = ['red', 'blue', 'green', 'expensive']
+    download_manager(keywords, extra_keywords=extra_words, total=3, multiprocess=True)
+    download_manager(main_keywords, extra_keywords=supplemented_keywords, total=10,
+                     multiprocess=True, debug=True, download_dir='./new_dir/')
+
diff --git a/urllib_downloader/Example/simple.py b/urllib_downloader/Example/simple.py
new file mode 100644
index 0000000..fd58969
--- /dev/null
+++ b/urllib_downloader/Example/simple.py
@@ -0,0 +1,18 @@
+import os, sys, inspect
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+sys.path.insert(0, parentdir)
+from download_urllib import download_manager
+
+
+# Simple Download
+main_keywords = ['Pizza']
+download_manager(main_keywords)
+
+
+main_keywords = ['Car', 'Motorbike']
+download_manager(main_keywords, total=3)
+
+main_keywords = ['Car', 'Motorbike']
+extra_words = ['red', 'blue', 'green', 'expensive']
+download_manager(main_keywords, extra_keywords=extra_words, total=3)
\ No newline at end of file
diff --git a/urllib_downloader/README.MD b/urllib_downloader/README.MD
new file mode 100644
index 0000000..6ae255d
--- /dev/null
+++ b/urllib_downloader/README.MD
@@ -0,0 +1,62 @@
+Picture Downloader
+=======================
+
+
+Simple Picture Downloader using Google search and urllib.request
+
+@Author: [lc](https://github.com/WuLC)
+@Last Modified by: [Koubae](https://github.com/Koubae)
+
+-----------------------------------------------------------------------------------------------------
+
+
+REQUIREMENTS:
+-------------------
+
+
+1. **http.client — HTTP protocol client** [DOCS](https://docs.python.org/3/library/http.client.html)
+   - [Source Code](https://github.com/python/cpython/blob/3.9/Lib/http/client.py)
+This module defines classes which implement the client side of the HTTP and
+HTTPS protocols. It is normally not used directly — the module urllib.request uses it to handle URLs that use HTTP and HTTPS.
+
+2. **urllib.request — Extensible library for opening URLs** [DOCS](https://docs.python.org/3/library/urllib.request.html#module-urllib.request)
+
+- [Source Code](https://github.com/python/cpython/blob/3.9/Lib/urllib/request.py)
+
+3. **urllib.parse — Parse URLs into components** [DOCS](https://docs.python.org/3/library/urllib.parse.html#module-urllib.parse)
+
+4. **re — Regular expression operations** [DOCS](https://docs.python.org/3/library/re.html)
+
+- Google for Education: [Python Regular Expressions](https://developers.google.com/edu/python/regular-expressions)
+
+5. **os** [DOCS](https://docs.python.org/3/library/os.html)
+
+6. **logging — Logging facility for Python** [DOCS](https://docs.python.org/3/library/logging.html)
+
+7. **multiprocessing — Process-based parallelism** [DOCS](https://docs.python.org/3/library/multiprocessing.html)
+
+### Third party package
+
+1. **user_agent** This module is for generating random, valid web user agents:
+
+- [Source Code](https://github.com/lorien/user_agent) *install_requires=['six']*
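+
+### Usage
+
+A minimal sketch of the API (it mirrors `Example/simple.py`); the keywords and `total` below are illustrative values:
+
+```python
+from download_urllib import download_manager
+
+main_keywords = ['Car', 'Motorbike']
+extra_words = ['red', 'blue']
+
+# downloads up to 3 images per main keyword into ./data/<keyword>/
+download_manager(main_keywords, extra_keywords=extra_words, total=3)
+```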
+
+
+
+
+
+
diff --git a/urllib_downloader/Tests/test.py b/urllib_downloader/Tests/test.py
new file mode 100644
index 0000000..a1ce359
--- /dev/null
+++ b/urllib_downloader/Tests/test.py
@@ -0,0 +1,66 @@
+import os, sys, inspect
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+sys.path.insert(0, parentdir)
+from download_urllib import download_manager
+import unittest
+
+# Tests for urllib_downloader.download_urllib
+class Test(unittest.TestCase):
+
+    def test_one_keyword(self):
+        main_keyword = ['neutral']
+        download_manager(main_keyword, extra_keywords=None, total=2)
+
+    def test_download_false(self):
+        main_keyword = ['neutral']
+        download_manager(main_keyword, extra_keywords=None, total=2, download=False)
+
+    def test_custom_dir(self):
+        main_keywords = ['pizza', 'pasta']
+        extra_k = ['pomodoro', 'salami', 'tuna']
+        my_dir = './my_dir/'
+        download_manager(main_keywords, extra_keywords=extra_k, download_dir=my_dir, total=2)
+
+    def test_multiprocess(self):
+        main_keywords = ['neutral', 'angry']
+        supplemented_keywords = ['facial expression', 'people', 'covid', 'world']
+        download_manager(main_keywords, extra_keywords=supplemented_keywords, total=10, multiprocess=True, debug=True)
+
+        main_keywords = ['neutral', 'angry', 'surprise', 'disgust', 'fear', 'happy', 'sad']
+
+        supplemented_keywords = ['facial expression',
+                                 'human face',
+                                 'face',
+                                 'old face',
+                                 'young face',
+                                 'adult face',
+                                 'child face',
+                                 'woman face',
+                                 'man face',
+                                 'male face',
+                                 'female face',
+                                 'gentleman face',
+                                 'lady face',
+                                 'boy face',
+                                 'girl face',
+                                 'American face',
+                                 'Chinese face',
+                                 'Korean face',
+                                 'Japanese face',
+                                 'actor face',
+                                 'actress face',
+                                 'doctor face',
+                                 'movie face'
+                                 ]
+        download_manager(main_keywords, extra_keywords=supplemented_keywords, total='all', multiprocess=True, debug=True)
+
+
+def run_tests(test_class):
+    suite = unittest.TestLoader().loadTestsFromTestCase(test_class)
+    runner = unittest.TextTestRunner(verbosity=0)
+    runner.run(suite)
+
+
+if __name__ == '__main__':
+    run_tests(Test)
diff --git a/urllib_downloader/__init__.py b/urllib_downloader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/urllib_downloader/download_urllib.py b/urllib_downloader/download_urllib.py
new file mode 100644
index 0000000..6881ffa
--- /dev/null
+++ b/urllib_downloader/download_urllib.py
@@ -0,0 +1,243 @@
+import os
+import re
+import random
+import logging
+import urllib.request
+import urllib.error
+from urllib.parse import quote
+from multiprocessing import Pool, current_process, log_to_stderr
+from user_agent import generate_user_agent
+
+# @Author: lc
+# @Date:   2017-09-25 23:54:24
+# @Last Modified by: Koubae
+# @Last Modified time: 2020-12-10 16:34:22
+
+
+####################################################################################################################
+# Download images from Google with specified keywords for searching.
+# Each search query is composed as "main_keyword + supplemented_keyword";
+# if there are multiple keywords, each main_keyword is joined with each supplemented_keyword.
+# Mainly uses urllib; each search query downloads at most 100 images, since Google limits the page source.
+# Allows downloading with a single process or with multiple processes.
+####################################################################################################################
+
+# TODO: add a CLI with argparse
+
+log_file = 'trace.log'
+logging.basicConfig(level=logging.DEBUG, filename=log_file, filemode="a+",
+                    format="%(asctime)-15s %(levelname)-8s %(message)s")
+
+URL_ROOT = 'https://www.google.com/search?q='
+URL_END = '&source=lnms&tbm=isch'
+
+
+def logger(msg, level):
+    """Helper function | prints a message and writes it to the log file at the given level"""
+    print(msg, flush=True)
+    if level == 'info':
+        logging.info(msg)
+    elif level == 'warning':
+        logging.warning(msg)
+    elif level == 'error':
+        logging.error(msg)
+
+
+def downloader(url, process, search=False):
+    """Download the raw content of a whole Google Image webpage or of a single URL, depending on where it is called from
+    Args:
+        url (str): url of the page or of a picture
+        process (str): current process, used in multiprocessing, returned from current_process().name
+        search (bool): search flag; if True, downloads the entire page, if False, url is a picture
+    Returns:
+        raw content of the webpage or of the picture's URL
+    """
+
+    headers = dict()
+    headers['User-Agent'] = generate_user_agent()
+    if search:
+        headers['Referer'] = 'https://www.google.com'
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        resp = urllib.request.urlopen(req)
+        return resp.read()
+    except urllib.error.HTTPError as e:
+        err = f'HTTPError while downloading image {url}\nhttp code {e.code}, reason:{e.reason}, process:{process}'
+        logger(err, 'warning')
+    except urllib.error.URLError as e:
+        err = f'URLError while downloading image {url}\nreason:{e.reason}, process:{process}'
+        logger(err, 'warning')
+    except Exception as e:
+        if search:
+            err = f'error while downloading page {url} during process:{process}'
+            logger(err, 'error')
+        else:
+            err = f'Unexpected error while downloading {url}\nerror type:{type(e)}, args:{e.args}, process:{process}'
+            logger(err, 'error')
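+
+
+# NOTE: downloader() returns None on any failure (the exceptions above are
+# logged, not re-raised), so callers should check the return value, e.g.:
+#     data = downloader(url, process=current_process().name)
+#     if data: ...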
+
+
+def sniff_page(search_url, process):
+    """
+    Downloads a whole Google Image webpage and scans it for patterns of the src= html attribute.
+    Args:
+        search_url (str): search URL composed for Google Image search
+        process (str): current process, used in multiprocessing, returned from current_process().name
+    Returns:
+        set of links if found, else an empty set
+    """
+    page_content = downloader(search_url, process=process, search=True)
+    if page_content:
+        page_content = str(page_content)
+        link_list = re.findall('src="(.*?)"', page_content)
+        if len(link_list) == 0:
+            msg = f'Found 0 links from page {search_url}'
+            logger(msg, 'warning')
+            return set()
+        else:
+            return set(link_list)
+    else:
+        return set()
+
+
+def gen_dir(download_dir, main_keyword):
+    """Helper function | generates the directory where pics will be downloaded; the default dir is ./data/"""
+    if not download_dir:
+        download_dir = './data/'
+    img_dir = download_dir + main_keyword + '/'
+    if not os.path.exists(img_dir):
+        os.makedirs(img_dir)
+    return img_dir
+
+
+def gen_name(count, img_dir):
+    """Helper function | generates (hopefully) unique names for the pictures & prevents name collisions"""
+    # NOTE: if the current picture gets the same name as a previously downloaded pic, it will overwrite the old one.
+    id_ = str(hex(random.randrange(1000)))
+    file_name = id_ + f'_{count}.jpg'
+    file_path = img_dir + file_name
+    return file_name, file_path
+
+
+def extract_links(main_keyword, extra_keywords, process):
+    """
+    Helper function | feeds sniff_page with URLs composed for a Google search
+    Args:
+        main_keyword (str, list): main keyword of the search
+        extra_keywords (str, list, None): extra keywords appended to the search URL as %20<extra_keyword>
+        process (str): current process, used in multiprocessing, returned from current_process().name
+    Returns:
+        A set of unique links; empty if no urls are found
+    """
+    image_links = set()
+    if extra_keywords:
+        for extra_keyword in extra_keywords:
+            msg = f'Process {process} supplemented keyword: {extra_keyword}'
+            logger(msg, 'info')
+            search_url = URL_ROOT + quote(main_keyword + ' ' + extra_keyword) + URL_END
+            image_links = image_links.union(sniff_page(search_url, process))
+    else:
+        msg = f'Process {process} Keyword: {main_keyword}'
+        logger(msg, 'info')
+        search_url = URL_ROOT + quote(main_keyword) + URL_END
+        image_links = image_links.union(sniff_page(search_url, process))
+    msg = f'Process {process} got {len(image_links)} links so far'
+    logger(msg, 'info')
+    return image_links
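+
+
+# For illustration: a search URL composed from main keyword 'happy' and extra
+# keyword 'face' becomes
+#     https://www.google.com/search?q=happy%20face&source=lnms&tbm=isch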
+
+
+def download_images(main_keyword, extra_keywords=None, download_dir=None, total=None, download=True):
+    """Download images with one main keyword and multiple supplemented keywords
+    Args:
+        main_keyword (str): main keyword
+        extra_keywords (list[str], optional): list of supplemented keywords
+        download_dir (str, optional): string ending with /, the pictures' download root directory
+        total (int, str): number of pictures to download | default 10 | if total == 'all', downloads every scraped url
+        download (bool): if False, pictures won't be downloaded | default True
+    Returns:
+        None
+    """
+
+    process = current_process().name
+    msg = f'Process {process} Main keyword: {main_keyword}'
+    logger(msg, 'info')
+
+    img_dir = gen_dir(download_dir, main_keyword)
+    image_links = extract_links(main_keyword, extra_keywords, process)
+
+    msg = f"Process {process} got {len(image_links)} links in total"
+    logger(msg, 'info')
+
+    if not download:
+        msg = "==="*15 + " < " + f"Process {process} Terminated" + " > " + "==="*15
+        logger(msg, 'info')
+        return
+
+    if isinstance(total, str) and total.lower() == 'all':
+        total = len(image_links)
+    elif not total:
+        total = 10
+
+    msg = "==="*15 + " < " + "Start downloading" + " > " + "==="*15
+    logger(msg, 'info')
+
+    count = 1  # NOTE: only used in the picture's file name; quite redundant, since gen_name already adds a random id.
+    limit = 0
+    errors = 0
+    for link in image_links:
+        if limit == total:
+            msg = "==="*15 + " < " + f"Process Terminated with {errors} errors in total" + " > " + "==="*15
+            logger(msg, 'info')
+            break
+        else:
+            data = downloader(link, process=process)
+            if data:
+                file_name, file_path = gen_name(count, img_dir)
+                with open(file_path, 'wb') as wf:
+                    wf.write(data)
+                msg = f'Process {process} downloaded image {main_keyword}/{file_name}'
+                logger(msg, 'info')
+                count += 1
+                limit += 1
+            else:
+                errors += 1
+                continue
+
+
+def download_manager(main_keywords, extra_keywords=None, download_dir=None,
+                     total=None, download=True, multiprocess=False, debug=False):
+    """Delegator function |
+    Calls download_images for each main keyword | Args are the same as for download_images"""
+
+    if multiprocess:
+
+        if debug:
+            log_to_stderr(logging.DEBUG)
+        p = Pool()
+        for main_keyword in main_keywords:
+            p.apply_async(download_images, args=(main_keyword, extra_keywords, download_dir, total, download))
+        p.close()
+        p.join()
+    else:
+        for main_keyword in main_keywords:
+            download_images(main_keyword, extra_keywords=extra_keywords,
+                            download_dir=download_dir, total=total, download=download)
+
+
+if __name__ == '__main__':
+
+    main_keywords = ['Pizza']
+    supplemented_keywords = ['tomato', 'basil']
+    download_manager(main_keywords, extra_keywords=supplemented_keywords, total='all', multiprocess=True, debug=True, download=True)
+
+
diff --git a/urllib_downloader/requirements.txt b/urllib_downloader/requirements.txt
new file mode 100644
index 0000000..006cbf1
--- /dev/null
+++ b/urllib_downloader/requirements.txt
@@ -0,0 +1 @@
+user-agent==0.1.9