-
Notifications
You must be signed in to change notification settings - Fork 0
/
dedup.py
97 lines (76 loc) · 3.48 KB
/
dedup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from __future__ import print_function
import sys
import os
import hashlib
# Python 2/3 compatibility: on Python 2 rebind input() to raw_input() so the
# prompt returns the typed text as a string rather than evaluating it; on
# Python 3 raw_input does not exist (NameError) and the builtin input() is
# already the string-returning variant.
try: input = raw_input
except NameError: pass
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive chunks read from an open file object.

    Args:
        fobj: An object with a ``read(size)`` method (e.g. a file opened
            in ``'rb'`` mode).
        chunk_size: Size passed to each ``fobj.read`` call.

    Yields:
        Each non-empty result of ``fobj.read(chunk_size)``, in order. The
        generator stops at the first empty read (end of file). It does not
        close ``fobj``; that is left to the caller.
    """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            # An empty read signals end of file.
            return
        yield chunk
def _hash_file(path, hash):
    """Return the digest of the file at *path* using the *hash* factory."""
    hashobj = hash()
    # The context manager closes the handle even if a read fails part-way,
    # so a large scan does not accumulate open file descriptors.
    with open(path, 'rb') as fobj:
        for chunk in chunk_reader(fobj):
            hashobj.update(chunk)
    return hashobj.digest()


def _remove_file(path, label):
    """Announce and delete *path*; report (do not raise) if the OS refuses."""
    print("Deleting:\n [%d] %s" % (label, path))
    try:
        os.remove(path)
    except OSError:
        # Only filesystem errors are expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        print("Could not find file:\n %s\nContinuing..." % path)


def check_for_duplicates(paths, delete=False, hash=hashlib.sha1):
    """Walk *paths* and report files whose content digest and size match.

    Args:
        paths: Iterable of directory paths; each is traversed with
            ``os.walk``.
        delete: When true, one file of every duplicate pair is removed.
            If both files are in the same directory the one with the
            larger modification time is deleted (the earlier-seen file on
            a tie); otherwise the user is prompted to pick ``1`` or ``2``.
        hash: Zero-argument factory returning a hashlib-style object
            (``update`` / ``digest``).

    Returns:
        None. Findings and actions are printed to standard output.
    """
    # Maps (digest, size) -> path of the surviving file with that content.
    hashes = {}
    for path in paths:
        for dirpath, _dirnames, filenames in os.walk(path):
            print("Checking directory: %s" % dirpath)
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                file_id = (_hash_file(full_path, hash),
                           os.path.getsize(full_path))
                duplicate = hashes.get(file_id)
                if duplicate is None:
                    hashes[file_id] = full_path
                    continue
                # The same file can be reached twice (overlapping root
                # paths, or a symlink to a file in the tree). It is not a
                # duplicate of itself, and deleting it would lose data.
                if os.path.realpath(full_path) == os.path.realpath(duplicate):
                    continue
                print("\nDuplicate found:\n [1] %s\n [2] %s"
                      % (full_path, duplicate))
                if not delete:
                    continue
                if os.path.dirname(full_path) == os.path.dirname(duplicate):
                    newer_is_current = (os.path.getmtime(full_path)
                                        > os.path.getmtime(duplicate))
                    print("Files are in the same directory: deleting newest")
                    selection = "1" if newer_is_current else "2"
                else:
                    selection = input("Which to delete? [1/2]> ")
                if selection == "1":
                    # The recorded file survives; hashes is already right.
                    _remove_file(full_path, 1)
                elif selection == "2":
                    _remove_file(duplicate, 2)
                    # The recorded file is gone: track the survivor so later
                    # matches are not compared against a deleted path.
                    hashes[file_id] = full_path
                else:
                    print("Not deleting either image")
# Command-line entry point.
#   <script> PATH [PATH ...]      report duplicates only
#   <script> -d PATH [PATH ...]   report duplicates and delete one of each pair
# NOTE(review): this runs whenever the module is executed or imported;
# consider an `if __name__ == "__main__":` guard if it is ever imported.
if sys.argv[1:] and sys.argv[1] == "-d":
    if sys.argv[2:]:
        check_for_duplicates(sys.argv[2:], delete=True)
    else:
        # "-d" without any path used to exit silently; show the usage hint.
        print("Please pass the paths to check as parameters to the script")
elif sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print("Please pass the paths to check as parameters to the script")