Unexpectedly, `Path.glob` takes about twice as long as a pure-Python implementation. The latter even gives you more control.
Here is the glob version:
class Crawler:
    """Benchmark a recursive directory listing done with ``Path.glob``."""

    def __init__(self, root='/run/media/tom/external'):
        """Remember the directory to crawl.

        Args:
            root: Directory to crawl (string or path-like). Defaults to the
                path this benchmark was originally hard-coded to, so
                ``Crawler()`` behaves exactly as before.

        Raises:
            AssertionError: If ``root`` does not exist.
        """
        self.root = Path(root)
        # NOTE: kept as an assert so the failure type stays AssertionError;
        # be aware that asserts are skipped when Python runs with -O.
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        """Run ``root.glob('**/*')`` to completion and print the elapsed time."""
        start = timeit.default_timer()
        # list() forces the lazy glob generator to walk the whole tree.
        list(self.root.glob('**/*'))
        end = timeit.default_timer()
        print(f'Elapsed time: {end - start}')
This outputs:
Elapsed time: 5.392456786998082
A pure-Python implementation:
class Crawler:
    """Benchmark a recursive file listing built on ``os.scandir``."""

    def __init__(self, root='/run/media/tom/external'):
        """Remember the directory to crawl.

        Args:
            root: Directory to crawl (string or path-like). Defaults to the
                path this benchmark was originally hard-coded to, so
                ``Crawler()`` behaves exactly as before.

        Raises:
            AssertionError: If ``root`` does not exist.
        """
        self.root = Path(root)
        # NOTE: kept as an assert so the failure type stays AssertionError;
        # be aware that asserts are skipped when Python runs with -O.
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        """Run ``_scan_files`` over ``root`` to completion and print the elapsed time."""
        start = timeit.default_timer()
        # list() forces the generator to walk the whole tree.
        list(self._scan_files(self.root))
        end = timeit.default_timer()
        print(f'Elapsed time: {end - start}')

    def _scan_files(self, directory, pbar=None) -> Iterator[Path]:
        """Yield every file below ``directory``, walking breadth-first.

        Stand-in for ``glob('**/*')`` based on ``os.scandir``. Only entries
        for which ``is_file()`` is true are yielded; directories themselves
        are not. Symlinked directories are not descended into.

        Args:
            directory: Directory to start from.
            pbar: Optional progress object; ``pbar.update(1)`` is called once
                per file just before it is yielded.

        Yields:
            A ``Path`` for each file found.
        """
        queue = deque([directory])  # Directories still waiting to be listed
        while queue:
            current_dir = queue.popleft()  # FIFO order gives breadth-first
            with os.scandir(current_dir) as scanner:
                for entry in scanner:
                    if entry.is_dir(follow_symlinks=False):
                        # Real sub-directory: list it on a later iteration.
                        queue.append(entry.path)
                    elif entry.is_file():
                        if pbar is not None:
                            pbar.update(1)
                        yield Path(entry.path)
This gives:
Elapsed time: 2.4762056120016496
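More than twice as fast! I did not expect that at all! (Note that the comparison is not strictly like for like: glob('**/*') also returns directories, whereas the scandir version yields only files.)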
It also gives you more flexibility, such as a progress bar or a maximum number of elements (e.g. for prototyping):
class Crawler:
    """Benchmark a recursive file listing built on ``os.scandir``."""

    def __init__(self, root='/run/media/tom/external'):
        """Remember the directory to crawl.

        Args:
            root: Directory to crawl (string or path-like). Defaults to the
                path this benchmark was originally hard-coded to, so
                ``Crawler()`` behaves exactly as before.

        Raises:
            AssertionError: If ``root`` does not exist.
        """
        self.root = Path(root)
        # NOTE: kept as an assert so the failure type stays AssertionError;
        # be aware that asserts are skipped when Python runs with -O.
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        """Run ``_scan_files`` over ``root`` to completion and print the elapsed time."""
        start = timeit.default_timer()
        # list() forces the generator to walk the whole tree.
        list(self._scan_files(self.root))
        end = timeit.default_timer()
        print(f'Elapsed time: {end - start}')

    def _scan_files(self, directory, pbar=None, max=None) -> Iterator[Path]:
        """Yield files below ``directory``, walking breadth-first.

        Stand-in for ``glob('**/*')`` based on ``os.scandir``. Only entries
        for which ``is_file()`` is true are yielded; directories themselves
        are not. Symlinked directories are not descended into.

        Args:
            directory: Directory to start from.
            pbar: Optional progress object; ``pbar.update(1)`` is called once
                per file just before it is yielded.
            max: Optional upper bound on the number of files to yield.
                ``None`` means no limit; zero or a negative value yields
                nothing.

        Yields:
            A ``Path`` for each file found, at most ``max`` of them.
        """
        # The limit is otherwise only checked after a yield, which would let
        # one file through even when the caller asked for none.
        if max is not None and max <= 0:
            return

        queue = deque([directory])  # Directories still waiting to be listed
        count = 0
        while queue:
            current_dir = queue.popleft()  # FIFO order gives breadth-first
            with os.scandir(current_dir) as scanner:
                for entry in scanner:
                    if entry.is_dir(follow_symlinks=False):
                        # Real sub-directory: list it on a later iteration.
                        queue.append(entry.path)
                    elif entry.is_file():
                        if pbar is not None:
                            pbar.update(1)
                        yield Path(entry.path)
                        count += 1
                        if max is not None and count >= max:
                            # Returning inside the with-block closes the scanner.
                            return
No comments:
Post a Comment