Sunday, 16 June 2024

Create large files in Python

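Useful for testing. The script below creates a text file of a requested size (e.g. '12mb', '3 gb', or a multiple of the machine's RAM), filled with sequential zero-padded numbers.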

import argparse
import re 
from pathlib import Path
import os
import math

def create_file_with_size(file_size_str: str, file_name: Path) -> None:
    """
    Create a text file of exactly the requested size, filled with sequential
    zero-padded numbers (one 20-digit number per line).

    The last line is truncated if needed so that the file is exactly the
    requested number of bytes. Missing parent directories are created.

    Invalid input is reported with a printed message and the function returns
    without creating anything; no exception is raised for a bad size string.

    Args:
        file_size_str (str): The desired size of the file, e.g., '12mb', '3 gb',
            '1.5kb', or '1.3 ram' (a multiple of the physical RAM size).
            Fractional byte counts are rounded up.
        file_name (Path): The path (including filename) where the file should be created.
    """

    # Bytes per unit. 'ram' is a placeholder that is resolved lazily below,
    # because os.sysconf is only queried when that unit is actually requested.
    size_units = {"b": 1, "kb": 1024, "mb": 1024**2, "gb": 1024**3, "ram": 0}

    # The whole string must be "<number><optional spaces><unit>"; fullmatch
    # rejects trailing junk such as '12mb extra' instead of silently ignoring it.
    match = re.fullmatch(r"(\d+(?:\.\d+)?)\s*([a-zA-Z]+)", file_size_str.strip())
    if not match:
        print("Invalid size format. Please use a format like '12mb' or '3 gb'.")
        return

    size = float(match.group(1))
    unit = match.group(2).lower()  # units are case-insensitive

    if unit not in size_units:
        print("Invalid size unit. Please use 'b', 'kb', 'mb', 'gb' or 'ram'.")
        return

    if unit == "ram":
        # Physical memory in bytes = page size * number of physical pages.
        size_units["ram"] = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')

    # Round up so a fractional request never yields a smaller file than asked for.
    file_size = math.ceil(size * size_units[unit])

    # Ensure the parent directory of the target file exists
    file_name.parent.mkdir(parents=True, exist_ok=True)

    wrap_at = 10**20  # first value that no longer fits in 20 digits
    remaining = file_size  # bytes still to be written
    number = 0

    # newline="\n" disables platform newline translation so every line is
    # exactly 21 bytes on disk; ASCII keeps one character equal to one byte.
    with open(file_name, 'w', encoding="ascii", newline="\n") as f:
        while remaining > 0:
            line = f"{number:020d}\n"
            # Slice the final line so the file never exceeds the requested size.
            f.write(line[:remaining])
            remaining -= len(line)
            # Wrap the counter so lines keep a fixed width (practically unreachable).
            number = (number + 1) % wrap_at

    print(f"File '{file_name}' of size {file_size_str.strip()} created successfully.")

def main():
    """
    Command-line entry point.

    Reads two positional arguments, the requested size and the destination
    filename, and hands them to create_file_with_size.
    """
    cli = argparse.ArgumentParser(description="Create a file of a specified size.")

    # Positional argument 1: how big the file should be
    cli.add_argument(
        "size",
        type=str,
        help="Size of the file to create (e.g., '12mb', '3 gb', or '1.3 ram' for 1.3xsize of RAM).",
    )
    # Positional argument 2: where the file should be written
    cli.add_argument(
        "destination",
        type=str,
        help="Filename of the target file",
    )

    options = cli.parse_args()

    # The destination arrives as a plain string; the worker expects a Path
    target = Path(options.destination)
    create_file_with_size(options.size, target)

# Run the command-line interface only when this file is executed directly;
# importing it as a module defines the functions without running main().
if __name__ == "__main__":
    main()


          

Parse Wikipedia dump

""" This module processes Wikipedia dump files by extracting individual articles and parsing them into a structured format, ...