# Source code for averell.core

import importlib
import logging
import os
from pathlib import Path

from slugify import slugify

from .utils import CORPORA_SOURCES
from .utils import download_corpora
from .utils import filter_corpus_features
from .utils import read_features
from .utils import write_json

# Default destination for corpora: a "corpora" folder inside the working
# directory of the process. It is resolved once, when this module is imported.
DEFAULT_OUTPUT_FOLDER = Path.cwd().joinpath("corpora")
# Set the root logger threshold to INFO so that the INFO records emitted by
# this module are not filtered out by the root logger's level.
logging.getLogger().setLevel(logging.INFO)


def get_corpora(corpus_indices=None, output_folder=DEFAULT_OUTPUT_FOLDER):
    """Download and uncompress selected corpora

    Once downloaded, every poem of each selected corpus is also written as a
    JSON file under
    ``<output_folder>/<corpus slug>/averell/parser/<author>/<poem title>``
    (author and title are slugified and truncated to 30 characters).

    Processing is best effort: if something fails, the problem is logged and
    the features collected so far are returned.

    :param corpus_indices: Indices of the corpus that will be downloaded
    :param output_folder: Local folder where the corpus is going to be
        uncompressed
    :return: Python list with one item per processed corpus, each item being
        the features returned by the reader of that corpus
    """
    corpora_features = []
    try:
        download_corpora(corpus_indices, output_folder)
        # ``None`` (the default) means that no corpus was selected, so there
        # is nothing to iterate over.
        selected = () if corpus_indices is None else corpus_indices
        for index in selected:
            properties = CORPORA_SOURCES[index]["properties"]
            corpus_path = Path(output_folder) / properties["slug"]
            parser_path = corpus_path / "averell" / "parser"
            # Each corpus names the module that knows how to read it
            reader = importlib.import_module(properties["reader"])
            features = reader.get_features(corpus_path)
            for poem in features:
                # max_length=30 to avoid "too long file name" error
                author = slugify(poem["author"], max_length=30)
                author_path = parser_path / author
                # exist_ok avoids the race between checking for the folder
                # and creating it
                author_path.mkdir(parents=True, exist_ok=True)
                poem_name = slugify(poem["poem_title"], max_length=30)
                write_json(poem, str(author_path / poem_name))
            corpora_features.append(features)
            logging.info(f"Downloaded {CORPORA_SOURCES[index]['name']} corpus")
    except IndexError:
        logging.error("Index number not in corpora list")
    except Exception:
        # Keep the best-effort contract (callers always get a list back) but
        # leave a traceback in the log instead of hiding the failure.
        # KeyboardInterrupt and SystemExit are not caught and propagate.
        logging.exception("Unexpected error while getting corpora")
    return corpora_features


def export_corpora(
    corpus_ids, granularity, corpora_folder, filename, no_download=False
):
    """
    Generates a single JSON file with the chosen granularity for all of the
        selected corpora

    The file is only written when at least one feature was collected.

    :param corpus_ids: IDs of the corpora that will be exported
    :param granularity: Level of parsing granularity
    :param corpora_folder: Local folder where the corpora is located
    :param filename: Name of the output file. When empty, the name is built
        by joining the slugs of the selected corpora with ``_`` and appending
        ``_<granularity>s``
    :param no_download: If ``True``, a missing corpus is not downloaded; an
        error is logged and that corpus is skipped
    :return: Tuple ``(corpora_features, export_filename)`` with the list of
        features filtered by the chosen granularity for all of the selected
        corpora, and the name of the output file
    """
    corpora_features = []
    slugs = []
    export_filename = filename
    corpora_path = Path(corpora_folder)

    # Without the folder there is nothing to read, and downloading it is
    # not allowed.
    if not corpora_path.exists() and no_download:
        logging.error("Corpora folder not found")
        return corpora_features, export_filename

    if not corpus_ids:
        logging.error("No CORPUS ID selected")
    elif granularity is None:
        logging.error("No GRANULARITY selected")
    else:
        for corpus_id in corpus_ids:
            try:
                corpus = CORPORA_SOURCES[corpus_id]
            except IndexError:
                logging.error("ID not in corpora list")
                continue
            corpus_folder = corpus["properties"]["slug"]
            # The slug is recorded before the checks below, so it is part
            # of the generated file name even if the corpus ends up skipped.
            slugs.append(corpus_folder)
            corpus_name = corpus["name"]
            corpus_path = corpora_path / corpus_folder
            if not corpus_path.exists():
                if no_download:
                    logging.error(
                        f'"{corpus_name} ({corpus_folder})" not '
                        f'found in "{corpora_folder}" folder')
                    continue
                get_corpora([corpus_id], corpora_folder)
            if granularity not in corpus["properties"]["granularity"]:
                logging.error(
                    f"'{granularity}' granularity not found on "
                    f"'{corpus_name}' properties")
                continue
            features = read_features(corpus_path)
            corpora_features.extend(
                filter_corpus_features(features, corpus_id, granularity))

    if not export_filename:
        export_filename = "_".join(slugs)
        export_filename = f"{export_filename}_{granularity}s"

    if corpora_features:
        write_json(corpora_features, export_filename)
    return corpora_features, export_filename