Download a Wikipedia Category
Tuesday, 20 February 2024

import wikipediaapi
import logging
import json
from pathlib import Path
from slugify import slugify
import click

logger = logging.getLogger(__name__)

# User agent string is required by the Wikipedia API policy; language is English.
wiki_wiki = wikipediaapi.Wikipedia('parasort (tom-010@web.de)', 'en')

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)


@click.command()
@click.argument('category', type=str)
@click.option('--out', type=click.Path(path_type=Path), default='out', help='Output directory')
def main(category: str, out: Path):
    download_wikipedia_category(category, out)


def save_page(page, path: Path, breadcrumbs: list[str]):
    """Save the content of a Wikipedia page into a file."""
    logger.info(f'Saving page: {page.title}')
    path.mkdir(parents=True, exist_ok=True)
    target = path / f'{slugify(page.title)}.json'
    if target.exists():
        logger.info(f'Page already exists: {page.title}')
        return
    page_json = {
        'title': page.title,
        'summary': page.summary,
        'pageid': page.pageid,
        'breadcrumbs': breadcrumbs,
        'sections': []
    }
    for section in page.sections:
        page_json['sections'].append({
            'level': section.level,
            'title': section.title,
            'text': section.full_text()
        })
    content = page.text
    if content:
        with target.open('w', encoding='utf-8') as f:
            json.dump(page_json, f, ensure_ascii=False, indent=2)


def process_category(category, path: Path, breadcrumbs: list[str] = None):
    """Process each category and its subcategories."""
    if breadcrumbs is None:
        breadcrumbs = []
    logger.info(f'Processing category: {category}')
    for c in category.categorymembers.values():
        if c.ns == wikipediaapi.Namespace.CATEGORY:
            # Recurse into the subcategory in its own directory.
            # Pass a new breadcrumbs list so sibling categories don't share state.
            category_name = c.title.replace('Category:', '')
            sub_path = path / slugify(category_name)
            process_category(c, sub_path, breadcrumbs + [category_name])
        else:
            save_page(c, path, breadcrumbs)


def download_wikipedia_category(category_name: str, out_dir: Path):
    """Download all pages from a Wikipedia category and its subcategories."""
    cat = wiki_wiki.page("Category:" + category_name)
    target = out_dir / slugify(category_name)
    process_category(cat, target)


if __name__ == '__main__':
    main()
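The imports correspond to the wikipedia-api, python-slugify, and click packages on PyPI. Assuming the script is saved as download_category.py (the filename is not given in the post) and a category name such as "Machine learning" is used purely as an illustration, it could be run roughly like this:

    pip install wikipedia-api python-slugify click
    python download_category.py "Machine learning" --out out

    # Subcategories become nested directories; each page becomes one JSON file
    # shaped like (example values are hypothetical):
    # out/machine-learning/artificial-neural-networks/perceptron.json
    # {
    #   "title": "Perceptron",
    #   "summary": "...",
    #   "pageid": 123456,
    #   "breadcrumbs": ["Artificial neural networks"],
    #   "sections": [{"level": 1, "title": "...", "text": "..."}]
    # }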