Skip to content

Commit

Permalink
tumblr_backup: --json-dirs
Browse files Browse the repository at this point in the history
  • Loading branch information
cebtenzzre committed Sep 29, 2020
1 parent bf85ce9 commit 90a7af8
Showing 1 changed file with 78 additions and 14 deletions.
92 changes: 78 additions & 14 deletions tumblr_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@
except ImportError:
pyjq = None

try:
from os import scandir # type: ignore[attr-defined]
except ImportError:
try:
from scandir import scandir # type: ignore[no-redef]
except ImportError:
scandir = None # type: ignore[assignment,no-redef]

# These builtins have new names in Python 3
try:
long, xrange # type: ignore[has-type]
Expand Down Expand Up @@ -164,6 +172,7 @@ def tb_urlopen(url):

disable_note_scraper = set() # type: Set[str]
disablens_lock = threading.Lock()
old_json_dir_entries = None # type: Optional[Tuple[str, ...]]


class Logger(object):
Expand Down Expand Up @@ -275,7 +284,37 @@ def mktime(tml):
options.p_stop = mktime(tm)


def initial_apiparse(base, old_json_dir):
global old_json_dir_entries
if old_json_dir is not None:
old_json_dir_entries = tuple(e.path for e in sorted(
(e for e in scandir(old_json_dir) if (e.name.endswith('.json') and e.is_file())),
key=lambda e: e.name))
else:
old_json_dir_entries = None

return apiparse(base, 1)


def apiparse(base, count, start=0):
# type: (...) -> Optional[JSONDict]
if old_json_dir_entries is not None:
# Reconstruct the API response
posts = []
posts_respfiles = old_json_dir_entries[start:]
for prf in posts_respfiles[:count]:
with io.open(prf, encoding=FILE_ENCODING) as f:
try:
post = json.load(f)
except ValueError as e:
f.seek(0)
log('{}: {}\n{!r}\n'.format(e.__class__.__name__, e, f.read()))
return None
posts.append(post)
return {'posts': posts,
'post_respfiles': posts_respfiles,
'blog': dict(posts[0]['blog'] if posts else {}, posts=len(old_json_dir_entries))}

params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
if start > 0:
params['offset'] = start
Expand Down Expand Up @@ -577,7 +616,7 @@ def footer(base, previous_page, next_page, suffix):
f += '</nav></footer>\n'
return f

def backup(self, account):
def backup(self, account, old_json_dir):
"""makes single files and an index for every post on a public Tumblr blog account"""

base = get_api_url(account)
Expand Down Expand Up @@ -618,8 +657,7 @@ def backup(self, account):
else:
log.status('Getting basic information\r')

# start by calling the API with just a single post
resp = apiparse(base, 1)
resp = initial_apiparse(base, old_json_dir)
if not resp:
self.errors = True
return
Expand All @@ -644,9 +682,11 @@ def backup(self, account):
backup_pool = ThreadPool()

# returns whether any posts from this batch were saved
def _backup(posts):
for p in sorted(posts, key=lambda x: x['id'], reverse=True):
post = post_class(p, account)
def _backup(posts, post_respfiles):
sorted_posts = sorted(zip(posts, post_respfiles),
key=lambda x: x[0]['id'], reverse=True)
for p, prf in sorted_posts:
post = post_class(p, account, prf)
if ident_max and long(post.ident) <= ident_max:
return False
if options.count and self.post_count >= options.count:
Expand Down Expand Up @@ -697,8 +737,12 @@ def _backup(posts):
continue

posts = resp[posts_key]
post_respfiles = resp.get('post_respfiles')
if post_respfiles is None:
post_respfiles = [None for _ in posts]

# `_backup(posts)` can be empty even when `posts` is not if we don't backup reblogged posts
if not posts or not _backup(posts):
if not posts or not _backup(posts, post_respfiles):
log.status('Backing up posts found empty set of posts, finishing\r')
break

Expand Down Expand Up @@ -735,12 +779,12 @@ def _backup(posts):
class TumblrPost(object):
post_header = '' # set by TumblrBackup.backup()

def __init__(self, post, backup_account):
# type: (JSONDict, str) -> None
def __init__(self, post, backup_account, respfile):
# type: (JSONDict, str, Text) -> None
self.content = ''
self.post = post
self.backup_account = backup_account
self.json_content = to_unicode(json.dumps(post, sort_keys=True, indent=4, separators=(',', ': ')))
self.respfile = respfile
self.creator = post['blog_name']
self.ident = str(post['id'])
self.url = post['post_url']
Expand Down Expand Up @@ -883,7 +927,7 @@ def make_player(src_):

else:
log(u"Unknown post type '{}' in post #{}\n".format(self.typ, self.ident))
append(escape(self.json_content), u'<pre>%s</pre>')
append(escape(self.get_json_content()), u'<pre>%s</pre>')

self.content = '\n'.join(content)

Expand Down Expand Up @@ -1147,7 +1191,13 @@ def save_post(self):
os.utime(f.name, (self.date, self.date))
if options.json:
with open_text(json_dir, self.ident + '.json') as f:
f.write(self.json_content)
f.write(self.get_json_content())

def get_json_content(self):
if self.respfile is not None:
with io.open(self.respfile, encoding=FILE_ENCODING) as f:
return f.read()
return to_unicode(json.dumps(self.post, sort_keys=True, indent=4, separators=(',', ': ')))

@staticmethod
def _parse_url_match(match, transform=None):
Expand Down Expand Up @@ -1275,6 +1325,10 @@ class CSVCallback(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, set(values.split(',')))

class CSVListCallback(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, list(values.split(',')))

class RequestCallback(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
request = getattr(namespace, self.dest) or {}
Expand Down Expand Up @@ -1343,6 +1397,8 @@ def __call__(self, parser, namespace, values, option_string=None):
help='add EXIF keyword tags to each picture'
" (comma-separated values; '-' to remove all tags, '' to add no extra tags)")
parser.add_argument('-S', '--no-ssl-verify', action='store_true', help='ignore SSL verification errors')
parser.add_argument('--json-dirs', action=CSVListCallback, default=[], metavar='DIRS',
help='comma-separated list of directories containing API responses (one per blog)')
parser.add_argument('blogs', nargs='*')
options = parser.parse_args()

Expand Down Expand Up @@ -1397,13 +1453,21 @@ def __call__(self, parser, namespace, values, option_string=None):
if pyjq is None:
parser.error("--filter: module 'pyjq' is not installed")
options.filter = pyjq.compile(options.filter)
if options.json_dirs:
if not scandir:
parser.error("--json-dirs: Python is less than 3.5 and module 'scandir' is not installed")
if len(options.json_dirs) != len(blogs):
parser.error('--json-dirs: expected {} directories, got {}'.format(len(blogs), len(options.json_dirs)))
for d in options.json_dirs:
if not os.access(d, os.R_OK | os.X_OK):
parser.error("--json-dirs: directory '{}' cannot be read".format(d))

global backup_account
tb = TumblrBackup()
try:
for account in blogs:
for i, account in enumerate(blogs):
log.backup_account = account
tb.backup(account)
tb.backup(account, options.json_dirs[i] if options.json_dirs else None)
except KeyboardInterrupt:
sys.exit(EXIT_INTERRUPT)

Expand Down

0 comments on commit 90a7af8

Please sign in to comment.