Download a Wikipedia Category
Tuesday, 20 February 2024

import json
import logging
from pathlib import Path

import click
import wikipediaapi
from slugify import slugify

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# A descriptive user agent with contact details, as requested by the Wikimedia API guidelines
wiki_wiki = wikipediaapi.Wikipedia('parasort (tom-010@web.de)', 'en')
@click.command()
@click.argument('category', type=str)
@click.option('--out', type=click.Path(path_type=Path), default='out', help='Output directory')
def main(category: str, out: Path):
    download_wikipedia_category(category, out)
def save_page(page, path: Path, breadcrumbs: list[str]):
    """Save the content of a Wikipedia page into a JSON file."""
    logger.info(f'Saving page: {page.title}')
    path.mkdir(parents=True, exist_ok=True)
    target = path / f'{slugify(page.title)}.json'
    if target.exists():
        logger.info(f'Page already exists: {page.title}')
        return
    page_json = {
        'title': page.title,
        'summary': page.summary,
        'pageid': page.pageid,
        'breadcrumbs': breadcrumbs,
        'sections': []
    }
    for section in page.sections:
        page_json['sections'].append({
            'level': section.level,
            'title': section.title,
            'text': section.full_text()
        })
    # Only write the file if the page actually has text
    if page.text:
        with target.open('w', encoding='utf-8') as f:
            json.dump(page_json, f, ensure_ascii=False, indent=2)
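
Each page ends up as a single JSON file named after its slugified title. Purely for illustration (field values shortened, the page id and section values are placeholders), such a file would look roughly like this:

    {
      "title": "Example article",
      "summary": "Short lead paragraph of the article ...",
      "pageid": 12345,
      "breadcrumbs": ["Top category", "Subcategory"],
      "sections": [
        {"level": 0, "title": "History", "text": "..."}
      ]
    }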
def process_category(category, path: Path, breadcrumbs: list[str] = None):
    """Process a category and all of its subcategories recursively."""
    if breadcrumbs is None:
        breadcrumbs = []
    logger.info(f'Processing category: {category}')
    for c in category.categorymembers.values():
        if c.ns == wikipediaapi.Namespace.CATEGORY:
            # Recurse into the subcategory, mirroring it as a subdirectory
            # and extending the breadcrumb trail for its pages
            category_name = c.title.replace('Category:', '')
            sub_path = path / slugify(category_name)
            process_category(c, sub_path, breadcrumbs + [category_name])
        else:
            save_page(c, path, breadcrumbs)
def download_wikipedia_category(category_name: str, out_dir: Path):
    """Download all pages from a Wikipedia category and its subcategories."""
    cat = wiki_wiki.page('Category:' + category_name)
    target = out_dir / slugify(category_name)
    process_category(cat, target)


if __name__ == '__main__':
    main()
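
Assuming the script is saved as download_category.py (the filename here is just an example), downloading a category could look like this:

    python download_category.py "Machine learning" --out out

This creates out/machine-learning/ with one JSON file per article and one nested directory per subcategory, each directory named with the slugified category title.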