I want to hash all files in a directory. I/O (to an external SSD) is the bottleneck. How many threads should I use, and does it even matter?
import hashlib

def hash_path(path: str) -> str:
    """MD5-hash a file in 4 KiB chunks so large files never sit in memory."""
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b''):
            h.update(chunk)
    return h.hexdigest()
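As an aside, on Python 3.11+ the read loop can be replaced by hashlib.file_digest, which does the chunked reading internally. A minimal sketch; I'm assuming equivalent behaviour and have not benchmarked it here:

import hashlib

def hash_path_311(path: str) -> str:
    # hashlib.file_digest (Python 3.11+) handles the buffered reads for us.
    with open(path, 'rb') as f:
        return hashlib.file_digest(f, 'md5').hexdigest()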
from pathlib import Path

path = Path('/media/usb/dynamic_docs')
paths = [p for p in path.glob('**/*') if p.is_file()]
paths = paths[:1000]
import timeit

from multiprocessing.pool import ThreadPool
from tqdm.notebook import tqdm

x = [1, 2, 3, 4, 6, 8, 10, 15, 20, 25, 30, 40, 50, 70, 80, 120, 140]
y = []
for n in tqdm(x):
    # Time one full pass over the sample for each pool size.
    with ThreadPool(n) as pool:
        start = timeit.default_timer()
        pool.map(hash_path, paths)
        end = timeit.default_timer()
        duration = end - start
        print(n, duration)
        y.append(duration)
threads  seconds
      1    5.816
      2    3.622
      3    3.105
      4    2.916
      6    2.775
      8    2.748
     10    2.794
     15    2.865
     20    2.821
     25    2.749
     30    2.803
     40    2.845
     50    2.687
     70    2.864
     80    2.827
    120    2.737
    140    2.776

So yes, it matters, but only up to a point: going from 1 to 4 threads roughly halves the runtime, the curve flattens around 6-8 threads at ~2.75 s, and piling on more threads (up to 140) neither helps nor hurts.
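If that plateau generalizes to your setup (an assumption; it is measured on this one SSD and this 1000-file sample only), a small fixed pool is all you need in practice. A minimal sketch using concurrent.futures from the standard library; the 8-worker figure is read off the plateau above, not a universal constant:

from concurrent.futures import ThreadPoolExecutor

# 8 workers sits on the flat part of the curve above; tune for your hardware.
with ThreadPoolExecutor(max_workers=8) as ex:
    digests = dict(zip(paths, ex.map(hash_path, paths)))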