#!/usr/bin/env python
# coding=utf-8
import os
import re
import sys
import time
from argparse import ArgumentParser

import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne import layers, init, nonlinearities

from dataset_tools import *
from dssm import *
from string_tools import *
from data_retrieval import *
def compute_features_batch(papers, ngrams=None, verbosity=1):
    """Compute the features of the given list of papers, w.r.t. the n-grams.

    For every paper, the cleaned title and the cleaned abstract are joined
    with a single space and the resulting string is hashed.

    Args:
        papers (list of dicts): the list of papers whose features are to be
            computed; each paper is read through its 'index', 'title' and
            'abstract' keys
        ngrams (list of strings): the n-grams with which we compute the
            features; when None, the n-grams are initialised from the
            vocabulary generated from `papers`
        verbosity (int): 0: quiet, 1: normal, 2: high (a progress line is
            printed every 100 papers only when verbosity > 1)

    Returns:
        dict: the features of each paper, identified by its 'index'
    """
    sh = StringHasher()
    sc = StringCleaner()

    if ngrams is None:
        # No n-grams supplied: derive them from the vocabulary of the given
        # papers (i.e. they are specific to this set of papers).
        tokens = generate_vocab(papers)
        sh.init_ngrams(tokens)
    else:
        sh.load_ngrams(ngrams)

    # Hash the papers' titles and abstracts
    papers_feat = {}
    total = len(papers)
    for i, p in enumerate(papers, 1):
        title = sc.clean_string(p['title'])
        abstract = sc.clean_string(p['abstract'])
        to_hash = title + " " + abstract
        papers_feat[p['index']] = sh.hash(to_hash)
        if verbosity > 1 and i % 100 == 0:
            print("Paper %d over %d" % (i, total))
    return papers_feat
def main(dataset, author_papers_file, author_name, author_slug, author_dssm, unseen_papers):
    """Score a stream of unseen papers against the user's own papers.

    The user's papers and the unseen papers are hashed with the dataset's
    n-grams, projected through the trained DSSM, and compared pairwise with
    a dot product.

    Args:
        dataset (string): path to the user's dataset
        author_papers_file (string): path to the user's papers file (raw text file)
        author_name (string): author's full name
        author_slug (string): short and ASCII string for the author's name
        author_dssm (string): path to the trained DSSM's parameters file
        unseen_papers (string): path to the raw file containing unseen papers' titles and abstracts

    Returns:
        list of numpy arrays: one array per unseen paper (in the order the
        papers were parsed), holding its similarity with each of the user's
        papers
    """
    # Load DSSM params and user's papers (raw data)
    papers_loader = np.load(dataset)
    ngrams = papers_loader['ngrams']
    num_entries = papers_loader['num_entries'][0]
    user_papers_raw, _ = get_author_papers(author_name, author_slug, author_papers_file)
    papers_feat = compute_features_batch(user_papers_raw, ngrams)

    # Build DSSM, load params and compute user's papers projections
    num_samples = 1
    dssm_loader = np.load(author_dssm)
    # NOTE(review): 'dssm_struct' is indexed by string keys below, so it is
    # presumably a pickled dict; recent NumPy versions would then need
    # allow_pickle=True in np.load -- confirm against the saved file.
    dssm_struct = dssm_loader['dssm_struct'].reshape(1, -1)[0, 0]
    input_var = T.matrix()
    network = build_multi_dssm(input_var=input_var,
                               num_samples=num_samples,
                               num_entries=num_entries,
                               num_ngrams=len(ngrams),
                               num_hid1=dssm_struct['num_hid1'],
                               num_hid2=dssm_struct['num_hid2'],
                               num_out=dssm_struct['num_out'])
    lasagne.layers.set_all_param_values(network, dssm_loader['dssm'])
    prediction = lasagne.layers.get_output(network, deterministic=True)
    # The whole output is divided by its L2 norm, so every projection
    # returned by `f` has unit norm.
    output = prediction / prediction.norm(L=2)
    f = theano.function([input_var], output)
    user_papers = [f(x.reshape(1, -1))[0] for x in papers_feat.values()]

    # Parse the unseen papers and compute their features
    r_index = re.compile(r'^#index(.*)')
    r_author = re.compile(r'^#@(.*)')
    r_title = re.compile(r'^#\*(.*)')
    r_abstract = re.compile(r'^#!(.*)')
    r_cite = re.compile(r'^#%(.*)')
    unseen_papers_raw = get_irrelevant_papers(unseen_papers)
    unseen_parsed = [list2paper(p, r_index, r_author, r_title, r_abstract, r_cite)
                     for p in unseen_papers_raw]
    unseen_feats = compute_features_batch(unseen_parsed, ngrams)

    # Project the unseen papers with the same function as the user's papers.
    # Features are looked up by 'index' so the result follows parsing order.
    unseen_projs = [f(unseen_feats[p['index']].reshape(1, -1))[0] for p in unseen_parsed]

    # Compute similarities: dot product of the flattened unit-norm projections.
    # NOTE(review): flattening assumes each call to `f` yields the projection
    # of a single paper (batch of 1, num_samples = 1) -- confirm against
    # build_multi_dssm's output shape.
    sims = [np.array([np.dot(np.ravel(proj), np.ravel(y)) for y in user_papers])
            for proj in unseen_projs]
    return sims
if __name__ == '__main__':
    # Parse command
    parser = ArgumentParser()
    # --dataset and --dssm_values are handed straight to np.load by main(),
    # which cannot open a missing (None) path; require them so that argparse
    # reports the problem instead.
    parser.add_argument("-d", "--dataset", type=str, required=True,
                        help="Path to the user dataset")
    parser.add_argument("-n", "--name", type=str, help="Name of the author")
    parser.add_argument("-s", "--slug", type=str, help="Short ASCII string for the author's name")
    parser.add_argument("-af", "--author_file", type=str, help="Path to the author's papers file")
    parser.add_argument("-v", "--dssm_values", type=str, required=True,
                        help="Path to the trained DSSM's parameters file")
    # NOTE(review): the remaining options are forwarded to project helpers;
    # whether those accept None is not visible here, so they stay optional.
    parser.add_argument("-u", "--unseen_papers", type=str, help="Path to the unseen papers file")
    args = parser.parse_args()

    main(dataset=args.dataset,
         author_papers_file=args.author_file,
         author_name=args.name,
         author_slug=args.slug,
         author_dssm=args.dssm_values,
         unseen_papers=args.unseen_papers)