Source code for dataset_tools

import random, time, re, MySQLdb

import pandas as pd
import numpy as np
from scipy import sparse
import nltk

from string_tools import StringHasher, StringCleaner


def generate_vocab(papers):
    """Returns the vocabulary used in the papers given in parameters, after cleaning and stopwords removal.

    Args:
        papers (list of tuples): the raw list of papers from which to generate the vocabulary
            (each element is a tuple of 3 strings: id, title and abstract)

    Returns:
        list of strings: the list of tokens forming the vocabulary
    """
    sc = StringCleaner()
    # Generate the author's vocabulary
    corpus = " ".join(p[1] + " " + p[2] for p in papers)
    # Cleaning
    corpus = sc.clean_string(corpus)
    # Tokenization: keep tokens that are words (with optional internal hyphens), acronyms and percentages
    pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
    tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
    num_re = re.compile(r"^\d+$")
    tokens = set(t for t in tokens if not num_re.match(t))  # remove purely numeric tokens
    # Stemming
    porter = nltk.stem.PorterStemmer()
    return [porter.stem(t) for t in tokens]
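As a rough usage sketch (the paper tuples are invented, and the NLTK stopwords corpus must have been downloaded beforehand with nltk.download('stopwords')):

# Illustrative only: two invented (id, title, abstract) tuples
toy_papers = [
    ("p1", "Citation Recommendation with DSSM", "We study citation recommendation."),
    ("p2", "Bag-of-words Features", "A 10% improvement over strong baselines."),
]
vocab = generate_vocab(toy_papers)
print(sorted(vocab))  # stemmed tokens such as 'citat' or 'recommend' (exact set depends on StringCleaner)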
def compute_features(papers, stringHasher, verbosity=1):
    """Computes the features of a list of papers, with a given list of ngrams.

    Args:
        papers (list of tuples): the list of papers (each element is a tuple of 3 strings: id, title, abstract)
        stringHasher (:class:`string_tools.StringHasher`): the object which contains the list of ngrams
        verbosity (int): 0: quiet; 1: normal; 2: high

    Returns:
        dict of :class:`np.ndarray`: the list of papers represented as bag-of-words vectors
    """
    sc = StringCleaner()
    sh = stringHasher
    papers_feat = {}
    total = len(papers)
    start_time = time.time()
    i = 1
    for p_id, p_title, p_abstract in papers:
        title = sc.clean_string(p_title)
        abstract = sc.clean_string(p_abstract)
        to_hash = title + " " + abstract
        papers_feat[p_id] = sh.hash(to_hash)
        if verbosity > 1 and i % 100 == 0:
            print("Paper %d over %d" % (i, total))
        i += 1
    if verbosity > 0:
        print("Processed {} papers in {:.3f}s".format(len(papers), time.time() - start_time))
    return papers_feat
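Purely for illustration, a minimal way to call compute_features on its own; the StringHasher calls (init_ngrams, hash) mirror how prepare_dataset below drives it, and the toy vocabulary and paper are invented:

# Hypothetical sketch: hash one paper against a tiny fixed vocabulary
sh = StringHasher()
sh.init_ngrams(["citat", "recommend"])  # ngrams specific to this author
feats = compute_features([("p1", "A title", "An abstract on citation recommendation.")], sh, verbosity=0)
print(feats["p1"])  # one bag-of-words vector, keyed by paper id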
def invert_citations(citations, verbosity=1):
    """Transforms a list of citation relations into a hashtable cited_paper -> list of citing papers.

    Args:
        citations (list of tuples): the list of citation relations (each element is a tuple of 2 strings:
            *citing* paper's id, *cited* paper's id)
        verbosity (int): 0: quiet; 1: normal; 2: high

    Returns:
        dict: a dict whose keys are cited papers ids and whose values are the lists of the ids of the
            papers that cite the keys (string -> list of strings)
    """
    citations_assoc = {}
    total = len(citations)
    i = 1
    start_time = time.time()
    for (cited_by, cited_paper) in citations:
        if cited_paper in citations_assoc:
            citations_assoc[cited_paper].append(cited_by)
        else:
            citations_assoc[cited_paper] = [cited_by]
        if verbosity > 1 and i % 500 == 0:
            print("Citation %d over %d" % (i, total))
        i += 1
    if verbosity > 0:
        print("Processed {} citation relations in {:.3f}s".format(len(citations), time.time() - start_time))
    return citations_assoc
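A small worked example of the inversion, using toy ids only:

# (citing, cited) pairs: "a" and "b" both cite "x", and "a" also cites "y"
toy_citations = [("a", "x"), ("b", "x"), ("a", "y")]
print(invert_citations(toy_citations, verbosity=0))
# -> {'x': ['a', 'b'], 'y': ['a']}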
def prepare_dataset(user_papers, citations, cited_papers, tokens, bad_papers=None, verbosity=1):
    """Prepares data from string representations of papers in order to build a numeric dataset.

    The result is a tuple of 4 elements:

    (1) the user's papers, as a dictionary: each key is the id of a paper written by the user, and the
        value is the features of the paper (1D np.ndarray),
    (2) the cited papers, as a dictionary: each key is the id of a paper cited by the user, and the value
        is a tuple made of the list of the ids of the papers in which it is cited (list of strings) and the
        features of the paper (1D np.ndarray),
    (3) the irrelevant papers, as a dictionary like the first one,
    (4) the ngrams used to compute the features (list of strings).

    Args:
        user_papers (list of 3-tuples): the papers written by the user (each element is a tuple of 3 strings: id, title, abstract)
        citations (list of 2-tuples): the list of citation relations
        cited_papers (list of 3-tuples): the papers that the user has cited (each element is a tuple of 3 strings: id, title, abstract)
        tokens (list of strings): the vocabulary to use for computing features
        bad_papers (list of 3-tuples or None): unrelated papers (each element is a tuple of 3 strings: id, title, abstract)
        verbosity (int): 0: quiet; 1: normal; 2: high

    Returns:
        tuple: data to build a dataset with
    """
    # Verbosity: 0 = none, 1 = few details, 2 = many details
    sh = StringHasher()
    sc = StringCleaner()
    # Initiate the ngrams (specific to the author)
    sh.init_ngrams(tokens)
    # Hash the user's papers' titles and abstracts
    papers_feat = compute_features(user_papers, sh, verbosity)
    citations_assoc = invert_citations(citations, verbosity)
    # Hash the cited papers' titles and abstracts
    citations_feat = {}
    total = len(cited_papers)
    i = 1
    start_time = time.time()
    for (p_id, p_title, p_abstract) in cited_papers:
        title = sc.clean_string(p_title)
        abstract = sc.clean_string(p_abstract)
        to_hash = title + " " + abstract
        citations_feat[p_id] = (citations_assoc[p_id], sh.hash(to_hash))  # (list of citing ids, np.ndarray)
        if verbosity > 1 and i % 500 == 0:
            print("Cited paper %d over %d" % (i, total))
        i += 1
    if verbosity > 0:
        print("Processed {} cited papers in {:.3f}s".format(len(cited_papers), time.time() - start_time))
    if bad_papers is None:
        if verbosity > 0:
            print("Done.")
        return papers_feat, citations_feat, None, sh.ngrams
    # Hash the bad papers' titles and abstracts
    bad_feat = compute_features(bad_papers, sh, verbosity)
    if verbosity > 0:
        print("Done.")
    # We also return the author's specific list of ngrams (for future hashing)
    return papers_feat, citations_feat, bad_feat, sh.ngrams
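Putting the previous steps together, a hedged end-to-end sketch; every id and text below is invented, and it assumes string_tools and the NLTK stopwords corpus are available:

# Hypothetical pipeline sketch
user_papers = [("u1", "Title of my paper", "Abstract of my paper about citation recommendation.")]
cited_papers = [("c1", "A cited paper", "Abstract of a paper that u1 cites.")]
bad_papers = [("b%d" % k, "Unrelated paper %d" % k, "Unrelated abstract number %d." % k) for k in range(8)]
citations = [("u1", "c1")]  # (citing id, cited id)

tokens = generate_vocab(user_papers)
papers_feat, citations_feat, bad_feat, ngrams = prepare_dataset(
    user_papers, citations, cited_papers, tokens, bad_papers, verbosity=0)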
def build_dataset(papers, citations, bad_papers, num_entries=6, verbosity=1):
    """Builds a dataset from feature variables.

    Args:
        papers (dict): features of the user's papers (dict string -> np.ndarray)
        citations (dict): features of cited papers (dict string -> tuple(list of strings, np.ndarray))
        bad_papers (dict): features of unrelated papers (dict string -> np.ndarray)
        num_entries (int): the number of compared papers in the DSSM structure
        verbosity (int): 0: quiet; 1: normal; 2: high

    Returns:
        :class:`np.ndarray`: the dataset
    """
    start_time = time.time()
    # Number of non-citing papers needed to complete one block of the dataset
    num_others = num_entries - 2
    # Init result
    reps = 4
    num_samples = reps * len(citations)
    num_feats = list(papers.values())[0].shape[0]
    dataset = np.empty((num_samples, num_entries, num_feats))
    sample = 0
    acceptable_idx = list(bad_papers.keys())  # random.sample needs a sequence, not a set
    for rep in range(reps):
        for c_id, val in citations.items():
            dataset[sample][0] = val[1]  # features of the cited paper
            citing_papers = val[0]       # ids of the papers that cite c_id
            # Select the features of one of these papers, randomly
            one_citing_paper = citing_papers[random.randrange(len(citing_papers))]
            dataset[sample][1] = papers[one_citing_paper]
            # Select num_others "bad" papers
            selected_idx = random.sample(acceptable_idx, num_others)
            for i in range(num_others):
                dataset[sample][2 + i] = bad_papers[selected_idx[i]]
            sample += 1
            if verbosity > 1 and sample % 500 == 0:
                print("Sample {} over {}".format(sample, num_samples))
    if verbosity > 0:
        print("Generated dataset with {} samples in {:.3f}s".format(num_samples, time.time() - start_time))
    return dataset
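A self-contained toy call, with hand-made feature dictionaries standing in for the output of prepare_dataset (vectors of length 5, all values invented):

papers = {"u1": np.ones(5)}                              # one paper written by the user
citations = {"c1": (["u1"], np.full(5, 0.5))}            # cited by "u1", with its feature vector
bad_papers = {"b%d" % k: np.zeros(5) for k in range(8)}  # unrelated papers
dataset = build_dataset(papers, citations, bad_papers, num_entries=6, verbosity=0)
print(dataset.shape)  # (4, 6, 5): reps * len(citations) samples, num_entries papers each, 5 features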
def dataset_to_file(dataset, ngrams, filename='dataset'):
    """Saves a dataset to a file.

    Args:
        dataset (:class:`np.ndarray`): the dataset to save (built with :func:`dataset_tools.build_dataset`)
        ngrams (list of strings): the ngrams used to compute the features
        filename (string): the filename without extension (the saved file will be .npz)
    """
    num_samples, num_entries, num_features = dataset.shape
    # We reshape the ndarray from 3D to 2D so it can be stored as a CSR sparse matrix:
    # each row holds the features of one paper, and every `num_entries` consecutive rows form one sample
    dataset_sp = sparse.csr_matrix(dataset.reshape(num_samples * num_entries, num_features))
    np.savez(filename,
             num_entries=np.array([num_entries]),
             data=dataset_sp.data,
             indices=dataset_sp.indices,
             indptr=dataset_sp.indptr,
             shape=dataset_sp.shape,
             ngrams=ngrams)
def dataset_from_file(filename):
    """Loads a dataset from a file.

    Args:
        filename (string): the name of the file from which to extract the dataset

    Returns:
        tuple: the dataset (np.ndarray) and the ngrams (list of strings)
    """
    loader = np.load(filename)
    num_entries = loader['num_entries'][0]
    sp_dataset = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                                   shape=loader['shape'])
    dataset = sp_dataset.toarray()
    samp_entries, num_features = dataset.shape
    return dataset.reshape(int(samp_entries / num_entries), num_entries, num_features), loader['ngrams']
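A quick save/load roundtrip covering both dataset_to_file and dataset_from_file; the random array stands in for a real dataset, and note that np.savez appends the '.npz' extension that np.load then expects:

toy = np.random.rand(4, 6, 5)  # 4 samples, 6 entries per sample, 5 features
dataset_to_file(toy, ["tok1", "tok2"], filename="toy_dataset")
restored, toks = dataset_from_file("toy_dataset.npz")
assert restored.shape == toy.shape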