Contrary to what I expected, glob takes about twice as long as a pure-Python implementation built on os.scandir. The latter even gives you more control.
Here is the glob version:
class Crawler:
    """Benchmark harness: times a recursive ``Path.glob('**/*')`` over a fixed root."""

    def __init__(self):
        # Root of the tree to crawl; the assert below fails if the path is missing.
        self.root = Path('/run/media/tom/external')
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        """Collect every ``**/*`` match under the root and print the elapsed time."""
        clock = timeit.default_timer
        start = clock()
        # Materialise the generator so the whole traversal is included in the timing.
        list(self.root.glob('**/*'))
        end = clock()
        print(f'Elapsed time: {end - start}')
This outputs:
Elapsed time: 5.392456786998082
A pure python implementation:
class Crawler:
    """Benchmark harness: times an ``os.scandir``-based walk over a fixed root."""

    def __init__(self):
        # Root of the tree to crawl; the assert below fails if the path is missing.
        self.root = Path('/run/media/tom/external')
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        """Collect every file found by ``_scan_files`` and print the elapsed time."""
        clock = timeit.default_timer
        start = clock()
        # Materialise the generator so the whole traversal is included in the timing.
        list(self._scan_files(self.root))
        end = clock()
        print(f'Elapsed time: {end - start}')

    def _scan_files(self, directory, pbar=None) -> Iterator[Path]:
        """Yield a ``Path`` for every file below *directory*, breadth-first.

        Used in place of ``glob('**/*')``. Directories are queued and scanned
        later; they are never yielded themselves. Symlinked directories are not
        descended into.

        Args:
            directory: Directory at which the walk starts.
            pbar: Optional object with an ``update(n)`` method; it is called
                with ``1`` just before each file is yielded.
        """
        pending = deque((directory,))
        while pending:
            with os.scandir(pending.popleft()) as entries:
                for item in entries:
                    if item.is_dir(follow_symlinks=False):
                        # Defer sub-directories until the current one is done.
                        pending.append(item.path)
                        continue
                    if not item.is_file():
                        continue
                    if pbar is not None:
                        pbar.update(1)
                    yield Path(item.path)
This gives:
Elapsed time: 2.4762056120016496
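More than twice as fast! I did not expect that at all! (One caveat: the comparison is not entirely like-for-like, because glob('**/*') also returns directories, whereas the scandir version yields only files.)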
It also gives you more flexibility, such as a progress bar or a cap on the number of elements (e.g. for prototyping):
class Crawler:
    """Benchmark harness: times an ``os.scandir``-based walk over a fixed root."""

    def __init__(self):
        # Root of the tree to crawl; the assert below fails if the path is missing.
        self.root = Path('/run/media/tom/external')
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        """Collect every file found by ``_scan_files`` and print the elapsed time."""
        start = timeit.default_timer()
        # Materialise the generator so the whole traversal is included in the timing.
        list(self._scan_files(self.root))
        end = timeit.default_timer()
        print(f'Elapsed time: {end - start}')

    def _scan_files(self, directory, pbar=None, max=None) -> Iterator[Path]:
        """Yield a ``Path`` for every file below *directory*, breadth-first.

        Used in place of ``glob('**/*')``. Directories are queued and scanned
        later; they are never yielded themselves. Symlinked directories are not
        descended into.

        Args:
            directory: Directory at which the walk starts.
            pbar: Optional object with an ``update(n)`` method; it is called
                with ``1`` just before each file is yielded.
            max: Optional upper bound on the number of files yielded. ``None``
                means no limit; zero or a negative value yields nothing.
                (The name shadows the builtin but is kept for compatibility
                with existing keyword callers.)
        """
        # Check the limit up front: testing it only after a yield would let
        # max=0 leak one file (and one progress-bar tick) before stopping.
        if max is not None and max <= 0:
            return

        queue = deque([directory])  # Start with the root directory
        count = 0
        while queue:
            current_dir = queue.popleft()
            with os.scandir(current_dir) as scanner:
                for entry in scanner:
                    if entry.is_dir(follow_symlinks=False):
                        # Defer sub-directories until the current one is done.
                        queue.append(entry.path)
                    elif entry.is_file():
                        if pbar is not None:
                            pbar.update(1)
                        yield Path(entry.path)
                        count += 1
                        if count == max:
                            # Limit reached; leaving the ``with`` closes the scanner.
                            return
No comments:
Post a Comment