Thursday, 18 April 2024

Python: glob is even slower than pure Python

Not what I expected: glob takes twice as long as a pure Python implementation, and the latter even gives you more control.

Here is the glob version:

import timeit
from pathlib import Path


class Crawler:

    def __init__(self):
        self.root = Path('/run/media/tom/external')
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        start = timeit.default_timer()
        list(self.root.glob('**/*'))
        end = timeit.default_timer()
        print(f'Elapsed time: {end - start}')

This outputs:

Elapsed time: 5.392456786998082
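
The post doesn't show how the class is invoked; a minimal driver for either version would be something like:

if __name__ == '__main__':
    Crawler().work()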

A pure Python implementation:

import os
import timeit
from collections import deque
from collections.abc import Iterator
from pathlib import Path


class Crawler:

    def __init__(self):
        self.root = Path('/run/media/tom/external')
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        start = timeit.default_timer()
        list(self._scan_files(self.root))
        end = timeit.default_timer()
        print(f'Elapsed time: {end - start}')

    def _scan_files(self, directory, pbar=None) -> Iterator[Path]:
        '''
        Breadth-first file scan with os.scandir, instead of glob('**/*').
        '''
        queue = deque([directory])  # Start with the root directory
        while queue:
            current_dir = queue.popleft()  # Get one directory from the queue
            with os.scandir(current_dir) as scanner:
                for entry in scanner:
                    if entry.is_dir(follow_symlinks=False):
                        # Add sub-directories to the queue
                        queue.append(entry.path)
                    elif entry.is_file():
                        if pbar is not None:
                            pbar.update(1)  # Advance the optional progress bar
                        yield Path(entry.path)  # Yield each file path

This gives:

Elapsed time: 2.4762056120016496

More than twice as fast! I did not expect that at all! (To be fair, the two are not strictly equivalent: glob('**/*') also yields directories, while the scanner yields only files, which may account for part of the difference.)

It also gives you more flexibility, for example a progress bar or a maximum number of files (handy for prototyping):

import os
import timeit
from collections import deque
from collections.abc import Iterator
from pathlib import Path


class Crawler:

    def __init__(self):
        self.root = Path('/run/media/tom/external')
        assert self.root.exists(), f'{self.root} does not exist'

    def work(self):
        start = timeit.default_timer()
        list(self._scan_files(self.root))
        end = timeit.default_timer()
        print(f'Elapsed time: {end - start}')

    def _scan_files(self, directory, pbar=None, max_files=None) -> Iterator[Path]:
        '''
        Breadth-first file scan with os.scandir, instead of glob('**/*').
        Optionally updates a progress bar and stops after max_files files.
        '''
        queue = deque([directory])  # Start with the root directory
        count = 0
        while queue:
            current_dir = queue.popleft()  # Get one directory from the queue
            with os.scandir(current_dir) as scanner:
                for entry in scanner:
                    if entry.is_dir(follow_symlinks=False):
                        # Add sub-directories to the queue
                        queue.append(entry.path)
                    elif entry.is_file():
                        if pbar is not None:
                            pbar.update(1)  # Advance the optional progress bar
                        yield Path(entry.path)  # Yield each file path
                        count += 1
                        if max_files is not None and count >= max_files:
                            return  # Stop early once the limit is reached
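
Hooking up a progress bar could then look like this. This is just a sketch: tqdm is not used anywhere in the post, it is only an assumption for illustration, and the 1000-file limit is arbitrary.

from tqdm import tqdm  # assumed third-party dependency, not part of the original code

crawler = Crawler()
with tqdm(unit=' files') as pbar:
    # Stream the first 1000 file paths while updating the bar
    first_files = list(crawler._scan_files(crawler.root, pbar=pbar, max_files=1000))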

