# Source code for averell.core

import importlib
import logging
import os
from pathlib import Path

from slugify import slugify

from .utils import CORPORA_SOURCES
from .utils import download_corpora
from .utils import filter_corpus_features
from .utils import read_features
from .utils import write_json

# Default destination for corpora: a "corpora" folder inside the current
# working directory. The path is computed once, when this module is imported.
DEFAULT_OUTPUT_FOLDER = Path.cwd() / "corpora"
# Set the root logger's level to INFO at import time.
logging.getLogger().setLevel(logging.INFO)


def get_corpora(corpus_indices=None, output_folder=DEFAULT_OUTPUT_FOLDER):
    """Download and uncompress selected corpora.

    For every selected corpus, the ``get_features`` function of the reader
    module named in ``CORPORA_SOURCES`` is called and each poem it yields is
    written as JSON under
    ``<output_folder>/<corpus slug>/averell/parser/<author slug>/``.

    Errors are logged instead of raised: whatever was collected before the
    failure is returned. ``KeyboardInterrupt`` and ``SystemExit`` are not
    intercepted.

    :param corpus_indices: Indices of the corpora that will be downloaded;
        ``None`` (the default) is treated as an empty selection
    :param output_folder: Local folder where the corpora are going to be
        uncompressed
    :return: List with one entry per processed corpus, each entry being the
        value returned by that corpus reader's ``get_features``
    """
    corpora_features = []
    # ``None`` is not iterable; treat it explicitly as "nothing selected".
    indices = () if corpus_indices is None else corpus_indices
    try:
        download_corpora(corpus_indices, output_folder)
        for index in indices:
            corpus = CORPORA_SOURCES[index]
            properties = corpus["properties"]
            corpus_path = Path(output_folder) / properties["slug"]
            parser_path = corpus_path / "averell" / "parser"
            reader = importlib.import_module(properties["reader"])
            features = reader.get_features(corpus_path)
            for poem in features:
                # max_length=30 to avoid "too long file name" error
                author = slugify(poem["author"], max_length=30)
                author_path = parser_path / author
                author_path.mkdir(parents=True, exist_ok=True)
                title = slugify(poem["poem_title"], max_length=30)
                write_json(poem, str(author_path / title))
            corpora_features.append(features)
            logging.info(f"Downloaded {corpus['name']} corpus")
    except IndexError:
        logging.error("Index number not in corpora list")
    except Exception:
        # Keep the best-effort contract (always return what was collected),
        # but leave a traceback in the log instead of hiding the failure the
        # way a ``return`` inside ``finally`` would.
        logging.exception("Unexpected error while getting corpora")
    return corpora_features
def export_corpora(
    corpus_ids, granularity, corpora_folder, filename, no_download=False
):
    """Generate a single JSON file with the chosen granularity for all of
    the selected corpora.

    Problems (missing folder, no IDs, no granularity, unknown ID, missing
    corpus, unsupported granularity) are logged as errors and the affected
    corpus is skipped; nothing is raised for them.

    :param corpus_ids: IDs of the corpora that will be exported
    :param granularity: Level of parsing granularity
    :param corpora_folder: Local folder where the corpora is located
    :param filename: Name of the output file; when empty, a name is built
        from the slugs of the selected corpora and the granularity
    :param no_download: Whether to download or not a corpora when missing
    :return: Tuple ``(features, export_filename)`` with the list of filtered
        features of all exported corpora and the name of the output file
    """
    corpora_features = []
    slugs = []
    export_filename = filename
    base_folder = Path(corpora_folder)

    # Without the folder and without permission to download, nothing can be
    # exported.
    if not base_folder.exists() and no_download:
        logging.error("Corpora folder not found")
        return corpora_features, export_filename

    if not corpus_ids:
        logging.error("No CORPUS ID selected")
        return corpora_features, export_filename

    if granularity is None:
        logging.error("No GRANULARITY selected")
    else:
        for corpus_id in corpus_ids:
            try:
                corpus = CORPORA_SOURCES[corpus_id]
            except IndexError:
                logging.error("ID not in corpora list")
                continue

            corpus_folder = corpus["properties"]["slug"]
            slugs.append(corpus_folder)
            corpus_name = corpus["name"]
            corpus_path = base_folder / corpus_folder

            if not corpus_path.exists():
                if no_download:
                    logging.error(
                        f'"{corpus_name} ({corpus_folder})" not '
                        f'found in "{corpora_folder}" folder')
                    continue
                get_corpora([corpus_id], corpora_folder)

            if granularity not in corpus["properties"]["granularity"]:
                logging.error(
                    f"'{granularity}' granularity not found on "
                    f"'{corpus_name}' properties")
                continue

            corpora_features.extend(
                filter_corpus_features(
                    read_features(corpus_path), corpus_id, granularity
                )
            )

    if not export_filename:
        # Fall back to "<slug>_<slug>..._<granularity>s".
        export_filename = f"{'_'.join(slugs)}_{granularity}s"
    if corpora_features:
        write_json(corpora_features, export_filename)
    return corpora_features, export_filename