Source code for recom.train_dssm

#!/usr/bin/env python
# coding=utf-8

import sys
import os
import time
from argparse import ArgumentParser

import numpy as np
import theano
import theano.tensor as T
import lasagne

from recom.dssm import *
from recom.dataset_tools import *


def main(author_id=None, num_epochs=100, num_entries=6, num_hid1=300,
         num_hid2=300, num_out=128, learning_rate=0.1, input_file=None,
         output_file='output'):
    """Builds a DSSM and trains it on the dataset of the given author.

    The trained DSSM parameters are saved to a file so they can later be
    used for recommendation. You must specify either the author's id or
    the dataset input file.

    Args:
        author_id (int or None): id of the author in the SQL database
        num_epochs (int): number of training epochs
        num_entries (int): number of compared papers in the DSSM structure
        num_hid1 (int): number of units in the first hidden layer
        num_hid2 (int): number of units in the second hidden layer
        num_out (int): number of units in the output layer
        learning_rate (float): learning rate of the Adadelta update rule
        input_file (string or None): path to the dataset file of the author
        output_file (string): path to the output file (DSSM parameters)
    """

    if author_id is None and input_file is None:
        return 1

    # The input should be a tensor (3D np.array): each matrix (2D np.array)
    # in this tensor has the structure:
    # [x (cited paper), p+ (citing paper), p1-, ..., pn- (non-citing papers)]
    if author_id is None:
        print("Retrieving dataset from file...")
        inputs, ngrams = dataset_from_file(input_file)
    else:
        print("Retrieving data from SQL DB...")
        user_papers, citations, cited_papers = retrieve_data(author_id)
        print("Preparing dataset...")
        papers_feat, citations_feat, ngrams = prepare_dataset(user_papers, citations, cited_papers)
        print("Building computable dataset...")
        inputs = build_dataset(papers_feat, citations_feat, num_entries)

    # Build a DSSM (with several entries)
    print("Building the DSSM structure...")
    num_samples = 200   # number of samples per minibatch
    gamma = 500         # smoothing factor of the softmax in the loss
    input_var = T.matrix()
    network = build_multi_dssm(input_var=input_var, num_samples=num_samples,
                               num_entries=num_entries, num_ngrams=len(ngrams),
                               num_hid1=num_hid1, num_hid2=num_hid2,
                               num_out=num_out)
    prediction = lasagne.layers.get_output(network)

    # Post-NN operations to compute the loss
    loss = compute_loss(prediction, num_samples, num_entries, gamma)

    # NN training function: one call performs an Adadelta update on a minibatch
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adadelta(loss, params, learning_rate=learning_rate)
    train_fn = theano.function([input_var], loss, updates=updates)

    # Train the network, recording the average loss of each epoch
    loss_values = np.zeros(num_epochs)
    print("Beginning of DSSM training...")

    for epoch in range(num_epochs):
        train_err = 0
        train_batches = 0
        start_time = time.time()

        for batch_ in iterate_minibatches(inputs, num_samples, shuffle=True):
            # Flatten each 3D minibatch into the 2D matrix the network expects
            batch = np.reshape(batch_, (num_entries * num_samples, len(ngrams)))
            train_err += train_fn(batch)
            train_batches += 1

        avg_loss = np.float64(train_err / train_batches)
        loss_values[epoch] = avg_loss

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(avg_loss))

    # Save the DSSM parameters together with the structure used to build them
    dssm_struct = {
        'num_entries': num_entries,
        'num_hid1': num_hid1,
        'num_hid2': num_hid2,
        'num_out': num_out,
        'learning_rate': learning_rate,
        'gamma': gamma
    }
    np.savez(output_file, dssm=lasagne.layers.get_all_param_values(network),
             dssm_struct=dssm_struct, losses=loss_values)
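
# Example usage from Python (a minimal sketch; 'author_dataset.pkl' is a
# hypothetical path to a dataset file produced beforehand):
#
#     main(input_file='author_dataset.pkl', num_epochs=50,
#          output_file='dssm_params')
#
# This trains the DSSM on the exported dataset and writes the learned
# parameters, the network structure and the per-epoch losses to
# 'dssm_params.npz' (np.savez appends the .npz extension when missing).
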
if __name__ == '__main__':
    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("-e", "--epochs", type=int, default=10,
                        help="Number of iterations for the training")
    parser.add_argument("-n1", "--num_hidden1", dest="num_hid1", type=int, default=300,
                        help="Number of units in the first hidden layer")
    parser.add_argument("-n2", "--num_hidden2", dest="num_hid2", type=int, default=300,
                        help="Number of units in the second hidden layer")
    parser.add_argument("-no", "--num_out", dest="num_out", type=int, default=128,
                        help="Number of units in the output layer")
    parser.add_argument("-c", "--num_compare", dest="num_compare", type=int, default=4,
                        help="Number of non-relevant papers to consider for training")
    parser.add_argument("-r", "--rate", dest="learning_rate", type=float, default=0.1,
                        help="Learning rate for the Adadelta training")
    parser.add_argument("-o", "--output", dest="output_filename", type=str, default='output',
                        help="Filename for output")

    group = parser.add_mutually_exclusive_group()
    group.add_argument("-a", "--author", dest="author", type=int,
                       help="ID of the user in the database")
    group.add_argument("-i", "--input", dest="input_filename", type=str,
                       help="Filename for dataset input")

    args = parser.parse_args()

    if not args.author and not args.input_filename:
        parser.print_help()
        sys.exit(1)

    # The DSSM compares the cited paper and the citing paper with
    # num_compare non-citing papers, hence num_entries = num_compare + 2
    kwargs = {'num_epochs': args.epochs,
              'num_hid1': args.num_hid1,
              'num_hid2': args.num_hid2,
              'num_out': args.num_out,
              'num_entries': args.num_compare + 2,
              'learning_rate': args.learning_rate,
              'input_file': args.input_filename,
              'output_file': args.output_filename,
              'author_id': args.author if args.author else None}

    main(**kwargs)
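
# Example command line (illustrative; the input path is hypothetical):
#
#     python train_dssm.py -i author_dataset.pkl -e 100 -c 4 -o dssm_params
#
# With -c 4, each training sample compares the cited paper and the citing
# paper against four non-citing papers, so main() is called with
# num_entries = 4 + 2 = 6.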