
Commit

Added a lot after long time
GLivshits committed Aug 17, 2021
1 parent 329b7cf commit 70d67f6
Showing 17 changed files with 299 additions and 191 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,4 +6,5 @@
*.xz
/data
*.txt

*.jpg
*.pyc
29 changes: 29 additions & 0 deletions app.log
@@ -25,3 +25,32 @@
2021-06-07 14:03:40,463 - JSON Query to sanam.nadha/feed/: Could not find "window._sharedData" in html response. [retrying; skip with ^C]
2021-06-07 14:08:49,508 - JSON Query to ayxshaahsan/feed/: Could not find "window._sharedData" in html response. [retrying; skip with ^C]
2021-06-07 14:08:51,185 - JSON Query to ayxshaahsan/feed/: Could not find "window._sharedData" in html response. [retrying; skip with ^C]
2021-06-11 16:11:39,715 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 16:20:57,331 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 16:30:48,235 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 16:35:18,415 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 16:36:57,283 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. [retrying; skip with ^C]
2021-06-11 16:40:58,191 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. [retrying; skip with ^C]
2021-06-11 16:42:49,199 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. [retrying; skip with ^C]
2021-06-11 16:43:35,391 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. [retrying; skip with ^C]
2021-06-11 16:44:11,935 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. [retrying; skip with ^C]
2021-06-11 16:47:57,334 - JSON Query to explore/tags/smartphone/: HTTP error code 560. [retrying; skip with ^C]
2021-06-11 16:51:35,888 - JSON Query to explore/tags/smartphone/: HTTP error code 560. [retrying; skip with ^C]
2021-06-11 16:53:02,831 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 16:57:54,499 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:03:00,439 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:07:53,459 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:09:32,927 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:13:38,571 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:21:14,351 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:26:02,823 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:31:15,435 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-06-11 17:36:19,756 - JSON Query to explore/tags/smartphone/: HTTPSConnectionPool(host='www.instagram.com', port=443): Read timed out. (read timeout=10.0) [retrying; skip with ^C]
2021-07-02 11:49:02,746 - Error occurred during loading data. Trying to use cache server https://fake-useragent.herokuapp.com/browsers/0.1.11
Traceback (most recent call last):
File "/home/grisha/anaconda3/envs/instaloader/lib/python3.8/site-packages/fake_useragent/utils.py", line 154, in load
for item in get_browsers(verify_ssl=verify_ssl):
File "/home/grisha/anaconda3/envs/instaloader/lib/python3.8/site-packages/fake_useragent/utils.py", line 99, in get_browsers
html = html.split('<table class="w3-table-all notranslate">')[1]
IndexError: list index out of range
2021-07-02 12:15:30,399 - justa.wie: --login=USERNAME required.
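
The fake_useragent traceback above comes from the library failing to parse the browser table it fetches from its cache server. A minimal sketch of one way to guard against that failure, assuming fake-useragent 0.1.x and using a made-up fallback string that is not taken from this repository:

import fake_useragent

try:
    # May contact the remote cache server and fail to load or parse its response.
    user_agent = fake_useragent.UserAgent().random
except Exception:
    # Fall back to a fixed desktop string when the browser table cannot be loaded.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
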
9 changes: 9 additions & 0 deletions asd.py
@@ -0,0 +1,9 @@
import os
import shutil
from tqdm import tqdm

path = '/home/grisha/4TB/data'
for item in tqdm(os.listdir(path)):
full_path = os.path.join(path, item)
if 'posts' not in os.listdir(full_path):
shutil.rmtree(full_path)
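
The asd.py script above permanently deletes every profile folder under /home/grisha/4TB/data that has no posts subfolder. A minimal dry-run sketch of the same walk, assuming the same directory layout, that only reports what would be removed:

import os

path = '/home/grisha/4TB/data'
for item in os.listdir(path):
    full_path = os.path.join(path, item)
    # Only profile directories are candidates; stray files at the top level are skipped.
    if os.path.isdir(full_path) and 'posts' not in os.listdir(full_path):
        print('would remove:', full_path)
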
74 changes: 47 additions & 27 deletions download_images.py
@@ -8,6 +8,12 @@
import argparse
from tqdm import tqdm

class JsonExpired(Exception):
"""Base exception for this script.
:note: This exception should not be raised directly."""
pass


def write_raw(resp: Union[bytes, requests.Response], filename: str) -> None:
# print('FUNCTION EXECUTED:{}'.format(inspect.currentframe().f_code.co_name))
@@ -21,49 +27,63 @@ def write_raw(resp: Union[bytes, requests.Response], filename: str) -> None:
file.write(resp)
os.replace(filename + '.temp', filename)

def scrape_pics(path):
sess = requests.Session()
image_path = os.path.join(path, 'images')
def scrape_pics(path, save_path, num_threads):
profile_name = path.split(os.path.sep)[-1]
image_path = os.path.join(save_path, profile_name)
posts_path = os.path.join(path, 'posts')
if not os.path.exists(posts_path):
return
jsons = list(map(lambda x: os.path.join(posts_path, x), os.listdir(posts_path)))
if os.path.exists(image_path):
if len(os.listdir(image_path)) >= len(jsons):
return
if len(jsons) == 0:
return
os.makedirs(image_path, exist_ok = True)
i = 1
captions = {}
with lzma.open(os.path.join(path, 'all_posts.json.xz'), 'r') as f:
content = json.load(f)
for item in content:
executor = concurrent.futures.ThreadPoolExecutor(max_workers = num_threads)
def subprocess(path_to_json, i):
try:
caption = item.get('accessibility_caption', '')
resp = sess.get(item['display_url'])
with lzma.open(path_to_json, 'r') as f:
item = json.load(f)
resp = requests.get(item['node']['display_url'], timeout = 100)
if resp.status_code == 200:
filename = '{}.jpg'.format(i)
write_raw(resp.content, os.path.join(image_path, filename))
captions[filename] = caption
i += 1
elif (resp.status_code == 403) and (resp.text == 'URL signature expired'):
raise JsonExpired
children = item.get('edge_sidecar_to_children', {}).get('edges', [])
k = 0
for child in children:
caption = child.get('accessibility_caption', '')
resp = sess.get(child['display_url'])
resp = requests.get(child['display_url'])
if resp.status_code == 200:
filename = '{}.jpg'.format(i)
filename = '{}_child{}.jpg'.format(i, k)
write_raw(resp.content, os.path.join(image_path, filename))
captions[filename] = caption
i += 1
except:
continue
with lzma.open(os.path.join(path, 'image_captions.json.xz'), 'w') as f:
json.dump(captions, f)
k += 1
except KeyboardInterrupt:
raise
except JsonExpired:
os.remove(path_to_json)
except Exception as err:
pass
a = 0
jobs = [executor.submit(subprocess, item, i) for i, item in enumerate(jsons)]
for _ in concurrent.futures.as_completed(jobs):
a += 1



def main():
parser = argparse.ArgumentParser(description = 'Args for merging jsons.')
parser.add_argument('--path', type = str, required = True, help = 'Path to folder with all profiles (each profile as unique folder).')
parser.add_argument('--save-path', type = str, required = True, help = 'Path to save images')
parser.add_argument('--num-workers', type = int, default = None, help = 'How many cores to utilize.')
parser.add_argument('--threads-per-worker', type = int, default = 1)
args = parser.parse_args()
os.makedirs(args.save_path, exist_ok = True)
all_paths = list(map(lambda x: os.path.join(args.path, x), os.listdir(args.path)))
executor = concurrent.futures.ProcessPoolExecutor(max_workers=args.num_workers)
jobs = [executor.submit(scrape_pics, item) for item in all_paths]
i = 0
jobs = [executor.submit(scrape_pics, item, args.save_path, args.threads_per_worker) for item in all_paths]
for _ in tqdm(concurrent.futures.as_completed(jobs), total=len(jobs)):
i += 1

pass

if __name__ == '__main__':
main()
main()
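
For reference, a hedged sketch of calling the rewritten scrape_pics directly; the paths and thread count below are placeholders, and the module can be imported because main() only runs under __main__:

from download_images import scrape_pics

# Expects <path>/posts/ to hold per-post .json.xz files; images are written
# to <save_path>/<profile_name>/ by the threaded worker above.
scrape_pics(path='/home/grisha/4TB/data/example_profile',
            save_path='/home/grisha/4TB/images',
            num_threads=4)
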
67 changes: 67 additions & 0 deletions download_images_tor.py
@@ -0,0 +1,67 @@
import requests
import lzma
import json
from typing import Union
import os
import shutil
import concurrent.futures
import argparse
from tqdm import tqdm
import pandas as pd
import numpy as np

class JsonExpired(Exception):
"""Base exception for this script.
:note: This exception should not be raised directly."""
pass


def write_raw(resp: Union[bytes, requests.Response], filename: str) -> None:
# print('FUNCTION EXECUTED:{}'.format(inspect.currentframe().f_code.co_name))
"""Write raw response data into a file.
.. versionadded:: 4.2.1"""
with open(filename + '.temp', 'wb') as file:
if isinstance(resp, requests.Response):
shutil.copyfileobj(resp.content, file)
else:
file.write(resp)
os.replace(filename + '.temp', filename)

def scrape_pics(row, save_path):
resp = requests.get(row['display_url'], timeout = 100)
if resp.status_code == 200:
write_raw(resp.content, os.path.join(save_path, '{}_{}.jpg'.format(row['owner.id'], row['id'])))

def main():
parser = argparse.ArgumentParser(description = 'Args for merging jsons.')
parser.add_argument('--csv_path', type = str, required = True, help = 'Path to folder with all profiles (each profile as unique folder).')
parser.add_argument('--save_path', type = str, required = True, help = 'Path to save images')
parser.add_argument('--num-workers', type = int, default = None, help = 'How many cores to utilize.')
args = parser.parse_args()
os.makedirs(args.save_path, exist_ok = True)
df = pd.read_csv(args.csv_path, sep = ',')
bad_words = ['sale', 'discount', 'offer', 'price', 'ship', 'special', '%', 'shop', 'pcs', 'usd', 'cm', 'kg', 'material', 'stock', '*']

def remove_bad_posts(df, bad_words):
idxs = []
for item in bad_words:
for idx, row in df.iterrows():
if item in row['edge_media_to_caption.edges'].lower():
idxs.append(idx)
idxs = np.unique(idxs)
df = df.drop(index=idxs)
return df

print('Removing bad words')

df = remove_bad_posts(df, bad_words)
executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers)
jobs = [executor.submit(scrape_pics, row, args.save_path) for _, row in df.iterrows()]
for _ in tqdm(concurrent.futures.as_completed(jobs), total=len(jobs)):
pass
#for _, row in tqdm(df.iterrows()):
# scrape_pics(row, args.save_path)

main()
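
The remove_bad_posts helper above rescans the whole frame once per keyword. A rough single-pass equivalent is sketched below; the column name and keywords are taken from the script, while the two sample captions are made up:

import re
import pandas as pd

bad_words = ['sale', 'discount', 'offer', '%']  # subset of the script's list
df = pd.DataFrame({'edge_media_to_caption.edges': ['Big SALE, 50% off today', 'quiet sunset']})

pattern = '|'.join(re.escape(w) for w in bad_words)  # escapes '%', '*' and similar
mask = df['edge_media_to_caption.edges'].str.lower().str.contains(pattern, regex=True, na=False)
df = df[~mask].reset_index(drop=True)
print(df)
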
111 changes: 91 additions & 20 deletions instaloader.py
@@ -1,11 +1,20 @@
#!/usr/bin/env python3

import argparse
import concurrent.futures
import time

from instaloader import scrape_followers, scrape_user_data, scrape_posts, scrape_hashtag, scrape_location
from instaloader.proxyrotator import ProxyRotator
from instaloader.exceptions import InstaloaderException
from instaloader.exceptions import InstaloaderException, QueryReturnedNotFoundException, ProfileNotExistsException
from instaloader.instaloader import Instaloader
from instaloader.instaloadercontext import default_user_agent
import pandas as pd
from tqdm import tqdm

def delete_row(df, row_idx):
df = df.drop(row_idx)
return df

parser = argparse.ArgumentParser()
parser.add_argument('profiles', nargs='*',
@@ -34,30 +43,92 @@


api_key = args.api_key
proxy_object = None
if len(api_key) > 0 and args.use_proxy:
proxy_object = ProxyRotator(api_key = api_key, idx = args.proxy_index)
proxy_objects = []
if args.use_proxy:
if len(api_key) > 0:
for i in range(args.proxy_index + 1):
proxy_objects.append(ProxyRotator(api_key = api_key, idx = i))
else:
proxy_objects = [None]
else:
args.proxy_index = 0

loaders = []
for proxy in proxy_objects:
loader = Instaloader(sleep=True, quiet=False, user_agent='{}'.format(default_user_agent()),
dirname_pattern='data/{target}', filename_pattern='{target}_{date_utc}',
download_pictures = False,
download_videos = False,
download_video_thumbnails = False,
download_geotags = False,
download_comments=False,
save_metadata = True,
compress_json = True,
post_metadata_txt_pattern='',
storyitem_metadata_txt_pattern=None,
max_connection_attempts=2,
request_timeout=15.0,
resume_prefix='iterator',
check_resume_bbd=False,
rapidapi_key=None, proxyrotator = proxy)
loaders.append(loader)

func_dict = {'scrape_user_data': scrape_user_data, 'scrape_posts': scrape_posts,
'scrape_followers': scrape_followers, 'scrape_hashtag': scrape_hashtag,
'scrape_location': scrape_location}
func = func_dict[args.task]

df = pd.read_csv(args.csv_path, engine='python', sep=';')
if 'downloaded' not in df.columns:
df['downloaded'] = False
df['id'] = df['id'].astype(str)
df['username'] = df['username'].astype(str)
ids = df[['id', 'username']][~df['downloaded']]
ids['id'] = ids['id'].astype(str)
ids['username'] = ids['username'].astype(str)
ids['idx'] = ids.index
ids = ids.to_dict('records')


flag = True
if __name__ == '__main__':
k = 1
while flag:
print('Current attempt: {}.'.format(k))
try:
func.main(profiles = args.profiles ,filename = args.csv_path, proxy_object=proxy_object, compress_json = True)
flag = False
except KeyboardInterrupt:
flag = False
break
except Exception as err:
print('Following error occured:\n{}\nDoing sleep for 60 sec, then retry.'.format(str(err)))
time.sleep(60)
k += 1
flag = False
break
executor = concurrent.futures.ProcessPoolExecutor(max_workers=args.proxy_index + 1)
def subprocess(target, total_index):
global df
total_index += 1
print('Current total index is {}.'.format(total_index))
try:
func._main(instaloader = loaders[total_index % (args.proxy_index + 1)], target = target)
df.loc[target['idx'], 'downloaded'] = True
total_index += 1
if (total_index % 100 == 0):
df.to_csv(args.csv_path, sep = ';', index = None)
except (QueryReturnedNotFoundException, ProfileNotExistsException):
df = delete_row(df, target['idx'])
pass
except KeyboardInterrupt:
df.to_csv(args.csv_path, sep=';', index=None)
raise
except Exception as err:
print(err)

jobs = [executor.submit(subprocess, target, index) for index, target in enumerate(ids)]
for _ in tqdm(concurrent.futures.as_completed(jobs), total = len(jobs)):
pass

# if __name__ == '__main__':
# k = 1
# while flag:
# print('Current attempt: {}.'.format(k))
# try:
# func.main(profiles = args.profiles ,filename = args.csv_path, proxy_object=proxy_object, compress_json = True)
# flag = False
# except KeyboardInterrupt:
# flag = False
# break
# except Exception as err:
# print('Following error occured:\n{}\nDoing sleep for 60 sec, then retry.'.format(str(err)))
# time.sleep(60)
# k += 1
# flag = False
# break
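
The rewritten driver above expects a semicolon-separated CSV with id and username columns (a downloaded flag is added on the first run if missing) and marks rows as profiles complete. A minimal sketch of producing such a file, with made-up rows; the path is later read via args.csv_path:

import pandas as pd

targets = pd.DataFrame({'id': ['12345678', '87654321'],
                        'username': ['example_user_one', 'example_user_two']})
targets.to_csv('targets.csv', sep=';', index=False)
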

Binary file modified instaloader/__pycache__/__init__.cpython-38.pyc
Binary file modified instaloader/__pycache__/exceptions.cpython-38.pyc
Binary file modified instaloader/__pycache__/instaloader.cpython-38.pyc
Binary file modified instaloader/__pycache__/instaloadercontext.cpython-38.pyc
Binary file modified instaloader/__pycache__/nodeiterator.cpython-38.pyc
Binary file modified instaloader/__pycache__/proxyrotator.cpython-38.pyc
Binary file modified instaloader/__pycache__/structures.cpython-38.pyc