Source code for recom.generate_dataset

from argparse import ArgumentParser

import MySQLdb

from recom.data_retrieval import *
from recom.dataset_tools import *


def main(author_name, author_slug, author_papers_file, bad_papers_file,
         num_entries, db_name, output_file):
    """Given an author (name, papers), generates a dataset usable by the DSSM script.

    Args:
        author_name (string): the full name of the author
        author_slug (string): a short ASCII string for the author's name
            (example: "Gabriella Pasi" -> "pasi")
        author_papers_file (string): the relative path to the file containing
            the raw data of the author's papers
        bad_papers_file (string): the relative path to the file containing the
            raw data of irrelevant papers
        num_entries (int): the number of compared papers in the DSSM structure
            (usually, 6)
        db_name (string): the name of the SQL database in which all the papers
            are stored
        output_file (string): the relative path to the file in which the
            dataset is saved
    """
    db = MySQLdb.connect(user='root', passwd='root', db=db_name)
    c = db.cursor()

    # Parse the file containing the author's papers and generate the citations
    print("Parsing author's papers file")
    author_papers, _ = get_author_papers(author_name, author_slug, author_papers_file)
    citations = generate_citations(author_papers)

    # Retrieve the cited papers from the SQL database
    print("Retrieving cited papers from SQL database")
    cited = citations['cited'].unique()
    cited_papers = get_cited_papers(cited, c)

    # Parse the file containing the irrelevant papers, then retrieve the
    # papers they cite from the database
    print("Parsing irrelevant papers file")
    bad_papers = get_irrelevant_papers(bad_papers_file)
    print("Retrieving irrelevant papers citations from SQL database")
    bad_cited_papers = get_irrelevant_cited_papers(bad_papers, c)

    # Reformat everything as tuples
    authors_papers = tuple((p['index'], p['title'], p['abstract']) for p in author_papers)
    # .to_numpy() replaces DataFrame.as_matrix(), which was removed from pandas
    cites = tuple((row[0], row[1]) for row in citations.to_numpy())
    bad_papers = tuple((p['index'], p['title'], p['abstract'])
                       for p in (bad_papers + bad_cited_papers))
    print("")

    # Generate the global vocabulary
    print("Generating vocabulary")
    author_vocab = generate_vocab(author_papers)
    global_vocab = generate_vocab(bad_papers)
    tokens = list(set(author_vocab + global_vocab))
    print("")

    print("Preparing dataset...")
    papers_feat, citations_feat, bad_feat, ngrams = prepare_dataset(
        author_papers, cites, cited_papers, bad_papers, tokens)

    print("Building computable dataset...")
    inputs = build_dataset(papers_feat, citations_feat, bad_feat, num_entries)
    print("")

    print("Saving dataset to file: " + output_file + ".npz")
    dataset_to_file(inputs, ngrams, output_file)
    print("Done.")
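
# The following is a minimal, hypothetical sketch (not part of the original
# module) for inspecting the archive written by dataset_to_file. The exact
# array names stored in the .npz file depend on dataset_to_file's
# implementation, so this only lists whatever keys are present.
def inspect_dataset(output_file):
    """Print the array names and shapes stored in an .npz dataset file."""
    import numpy as np

    # dataset_to_file appends the ".npz" suffix, as the print statement in
    # main() suggests, so the same is done here
    archive = np.load(output_file + ".npz", allow_pickle=True)
    for name in archive.files:
        entry = archive[name]
        shape = getattr(entry, "shape", None)
        print(name, shape if shape is not None else type(entry))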
if __name__ == '__main__':
    # Parse the command-line arguments
    parser = ArgumentParser()
    parser.add_argument("-n", "--name", type=str, help="Name of the author")
    parser.add_argument("-s", "--slug", type=str, help="Short ASCII string for the author's name")
    parser.add_argument("-af", "--author_file", type=str, help="Path to the author's papers file")
    parser.add_argument("-bf", "--irrelevant_file", type=str, help="Path to the irrelevant papers file")
    parser.add_argument("-c", "--num_compare", type=int,
                        help="The number of irrelevant papers to compare the user's papers with, in the DSSM structure")
    parser.add_argument("-d", "--db_name", type=str, help="Name of the SQL database")
    parser.add_argument("-o", "--output_filename", type=str, help="Path to the output dataset file")
    args = parser.parse_args()

    kwargs = {'author_name': args.name,
              'author_slug': args.slug,
              'author_papers_file': args.author_file,
              'bad_papers_file': args.irrelevant_file,
              # num_compare only counts the irrelevant papers; the +2
              # presumably accounts for the source paper and the cited paper
              'num_entries': args.num_compare + 2,
              'db_name': args.db_name,
              'output_file': args.output_filename}
    main(**kwargs)
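
# Hypothetical invocation (the author, paths and database name are
# placeholders; adjust them to your own setup):
#
#   python -m recom.generate_dataset \
#       -n "Gabriella Pasi" -s pasi \
#       -af data/pasi_papers.txt -bf data/irrelevant_papers.txt \
#       -c 4 -d papers_db -o datasets/pasi
#
# With -c 4, num_entries is 6: presumably one source paper, one relevant
# (cited) paper and four irrelevant papers, matching the "usually, 6" note
# in main's docstring.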