I want to hash all the files in a directory. I/O (to an external SSD drive) is the bottleneck. How many threads should I use? Does it even matter?
import hashlib
def hash_path(path: str) -> str:
    """Return the MD5 hex digest of the file at *path*.

    Reads the file in fixed-size chunks so arbitrarily large files can be
    hashed without loading them fully into memory.

    NOTE: MD5 is fine for integrity/deduplication checks like this one,
    but must not be used for anything security-sensitive.
    """
    h = hashlib.md5()
    with open(path, 'rb') as f:
        # 64 KiB chunks keep per-read overhead low; the original 4 KiB is
        # needlessly small for sequential reads from an SSD.
        for chunk in iter(lambda: f.read(65536), b''):
            h.update(chunk)
    return h.hexdigest()
from pathlib import Path

# Recursively collect regular files under the benchmark directory,
# capped at 1000 entries so each timing run stays short.
path = Path('/media/usb/dynamic_docs')
paths = [entry for entry in path.rglob('*') if entry.is_file()][:1000]
import timeit  # was missing in the original: timeit.default_timer() raised NameError
from multiprocessing.pool import ThreadPool
from tqdm.notebook import tqdm

# Thread counts to benchmark. Hashing files is I/O-bound: CPython releases
# the GIL during blocking reads, so throughput should plateau once the
# drive is saturated rather than keep improving with more threads.
x = [1, 2, 3, 4, 6, 8, 10, 15, 20, 25, 30, 40, 50, 70, 80, 120, 140]
y = []  # wall-clock duration (seconds) for each thread count in x
for n in tqdm(x):
    with ThreadPool(n) as pool:
        start = timeit.default_timer()
        pool.map(hash_path, paths)
        end = timeit.default_timer()
        duration = end - start
        print(n, duration)
        y.append(duration)
Results (threads: seconds):
1: 5.8160336340006324
2: 3.622440821993223
3: 3.1050608700024895
4: 2.9156927020012517
6: 2.775160079996567
8: 2.7484407580050174
10: 2.7938896600026055
15: 2.8651001560065197
20: 2.8214209449943155
25: 2.7490319029966486
30: 2.8025731050001923
40: 2.8451450540014775
50: 2.6865460660046665
70: 2.8644705260012415
80: 2.8269265680064564
120: 2.73746527700132
140: 2.775528302998282

No comments:
Post a Comment