Source code for recom.generate_dataset

from argparse import ArgumentParser

import MySQLdb

from recom.data_retrieval import *
from recom.dataset_tools import *


def main(author_name, author_slug, author_papers_file, bad_papers_file,
         num_entries, db_name, output_file):
    """Given an author (name, papers), generates a dataset usable by the DSSM script.

    Args:
        author_name (string): the full name of the author
        author_slug (string): a short ASCII string for the author's name
            (example: "Gabriella Pasi" -> "pasi")
        author_papers_file (string): the relative path to the file containing
            the raw data of the author's papers
        bad_papers_file (string): the relative path to the file containing the
            raw data of irrelevant papers
        num_entries (int): the number of compared papers in the DSSM structure
            (usually, 6)
        db_name (string): the name of the SQL database in which all the papers
            are stored
        output_file (string): the relative path to the file in which the
            dataset is saved
    """
    db = MySQLdb.connect(user='root', passwd='root', db=db_name)
    c = db.cursor()

    # Parse the file containing the author's papers and generate the citations
    print("Parsing author's papers file")
    author_papers, _ = get_author_papers(author_name, author_slug, author_papers_file)
    citations = generate_citations(author_papers)

    # Retrieve the cited papers from the SQL database
    print("Retrieving cited papers from SQL database")
    cited = citations['cited'].unique()
    cited_papers = get_cited_papers(cited, c)

    # Parse the file containing the irrelevant papers, then retrieve the
    # papers they cite from the database
    print("Parsing irrelevant papers file")
    bad_papers = get_irrelevant_papers(bad_papers_file)
    print("Retrieving irrelevant papers citations from SQL database")
    bad_cited_papers = get_irrelevant_cited_papers(bad_papers, c)

    # Reformat everything as tuples
    authors_papers = tuple((p['index'], p['title'], p['abstract']) for p in author_papers)
    # .to_numpy() replaces DataFrame.as_matrix(), which was removed from pandas
    cites = tuple((row[0], row[1]) for row in citations.to_numpy())
    bad_papers = tuple((p['index'], p['title'], p['abstract'])
                       for p in (bad_papers + bad_cited_papers))
    print("")

    # Generate the global vocabulary
    print("Generating vocabulary")
    author_vocab = generate_vocab(author_papers)
    global_vocab = generate_vocab(bad_papers)
    tokens = list(set(author_vocab + global_vocab))
    print("")

    print("Preparing dataset...")
    papers_feat, citations_feat, bad_feat, ngrams = prepare_dataset(
        author_papers, cites, cited_papers, bad_papers, tokens)

    print("Building computable dataset...")
    inputs = build_dataset(papers_feat, citations_feat, bad_feat, num_entries)
    print("")

    print("Saving dataset to file: " + output_file + ".npz")
    dataset_to_file(inputs, ngrams, output_file)
    print("Done.")
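
# The following is a minimal, hypothetical sketch (not part of the original
# module) for inspecting the archive written by dataset_to_file. The exact
# array names stored in the .npz file depend on dataset_to_file's
# implementation, so this only lists whatever keys are present.
def inspect_dataset(output_file):
    """Print the array names and shapes stored in an .npz dataset file."""
    import numpy as np

    # dataset_to_file appends the ".npz" suffix, as the print statement in
    # main() suggests, so the same is done here
    archive = np.load(output_file + ".npz", allow_pickle=True)
    for name in archive.files:
        entry = archive[name]
        shape = getattr(entry, "shape", None)
        print(name, shape if shape is not None else type(entry))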
if __name__ == '__main__':
    # Parse the command-line arguments
    parser = ArgumentParser()
    parser.add_argument("-n", "--name", type=str, help="Name of the author")
    parser.add_argument("-s", "--slug", type=str, help="Short ASCII string for the author's name")
    parser.add_argument("-af", "--author_file", type=str, help="Path to the author's papers file")
    parser.add_argument("-bf", "--irrelevant_file", type=str, help="Path to the irrelevant papers file")
    parser.add_argument("-c", "--num_compare", type=int,
                        help="The number of irrelevant papers to compare the user's papers with, in the DSSM structure")
    parser.add_argument("-d", "--db_name", type=str, help="Name of the SQL database")
    parser.add_argument("-o", "--output_filename", type=str, help="Path to the output dataset file")
    args = parser.parse_args()

    kwargs = {'author_name': args.name,
              'author_slug': args.slug,
              'author_papers_file': args.author_file,
              'bad_papers_file': args.irrelevant_file,
              # num_compare only counts the irrelevant papers; the +2
              # presumably accounts for the source paper and the cited paper
              'num_entries': args.num_compare + 2,
              'db_name': args.db_name,
              'output_file': args.output_filename}
    main(**kwargs)
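
# Hypothetical invocation (the author, paths and database name are
# placeholders; adjust them to your own setup):
#
#   python -m recom.generate_dataset \
#       -n "Gabriella Pasi" -s pasi \
#       -af data/pasi_papers.txt -bf data/irrelevant_papers.txt \
#       -c 4 -d papers_db -o datasets/pasi
#
# With -c 4, num_entries is 6: presumably one source paper, one relevant
# (cited) paper and four irrelevant papers, matching the "usually, 6" note
# in main's docstring.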