Tuesday, 20 February 2024

Download a Wikipedia Category
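
A small script that uses the wikipedia-api, python-slugify and click packages to walk a Wikipedia category, recurse through its subcategories, and save every page it finds as a JSON file: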

import json
import logging
from pathlib import Path

import click
import wikipediaapi
from slugify import slugify

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The Wikimedia API policy asks for a descriptive user agent with contact details.
wiki_wiki = wikipediaapi.Wikipedia('parasort (tom-010@web.de)', 'en')

@click.command()
@click.argument('category', type=str)
@click.option('--out', type=click.Path(path_type=Path), default='out', help='Output directory')
def main(category: str, out: Path):
    download_wikipedia_category(category, out)


def save_page(page, path: Path, breadcrumbs: list[str]):
    """Save the content of a Wikipedia page into a JSON file."""
    path.mkdir(parents=True, exist_ok=True)
    target = path / f'{slugify(page.title)}.json'
    if target.exists():
        logger.info(f'Page already exists: {page.title}')
        return

    logger.info(f'Saving page: {page.title}')

    page_json = {
        'title': page.title,
        'summary': page.summary,
        'pageid': page.pageid,
        'breadcrumbs': breadcrumbs,
        'sections': [
            {
                'level': section.level,
                'title': section.title,
                'text': section.full_text(),
            }
            for section in page.sections
        ],
    }

    # Skip pages without any text (e.g. redirects or empty stubs).
    if page.text:
        with target.open('w', encoding='utf-8') as f:
            json.dump(page_json, f, ensure_ascii=False, indent=2)

def process_category(category, path: Path, breadcrumbs: list[str] | None = None):
    """Recursively process a category and all of its subcategories."""
    if breadcrumbs is None:
        breadcrumbs = []

    logger.info(f'Processing category: {category.title}')

    for member in category.categorymembers.values():
        if member.ns == wikipediaapi.Namespace.CATEGORY:
            # Recurse into the subcategory with its own directory. Passing a
            # new list instead of appending to the shared one keeps sibling
            # categories from inheriting each other's breadcrumb trail.
            category_name = member.title.replace('Category:', '')
            sub_path = path / slugify(category_name)
            process_category(member, sub_path, breadcrumbs + [category_name])
        else:
            save_page(member, path, breadcrumbs)

def download_wikipedia_category(category_name: str, out_dir: Path):
    """Download all pages from a Wikipedia category and its subcategories."""
    cat = wiki_wiki.page("Category:" + category_name)
    target = out_dir / slugify(category_name)
    process_category(cat, target)

if __name__ == '__main__':
    main()
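
To try it, save the script under any name, for example download_category.py (the filename is my choice, not from the post), and pass it a category name:

python download_category.py 'Machine learning' --out out

Each subcategory becomes a subdirectory under out/machine-learning/, and each article becomes a JSON file holding its title, summary, page id, breadcrumb trail and sections. The underlying function can also be called directly from Python, assuming the module is importable:

from pathlib import Path
download_wikipedia_category('Machine learning', Path('out'))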

