Source code for recom.data_retrieval

import os, re, time
import pandas as pd

import MySQLdb


def list2paper(l_paper, r_index=None, r_author=None, r_title=None, r_abstract=None, r_cite=None):
    """Transform a raw data paper (formatted as a list) into a dict.

    Every element of ``l_paper`` is tested against each of the five regular
    expressions; the first capture group of a matching regex is stored in the
    corresponding field. If a regex is None, a default regex is used. The
    default regexes recognise lines starting with ``#index``, ``#@``
    (authors), ``#*`` (title), ``#!`` (abstract) and ``#%`` (citation).

    For the index, title, abstract and authors fields, a later matching line
    overwrites an earlier one; citations accumulate in order of appearance.
    The authors capture is split on commas and each name is stripped.

    Args:
        l_paper (list of strings): the list of elements forming the paper
            (title, authors, etc.), in raw format
        r_index (:class:`_sre.SRE_pattern`): a compiled regex to match an
            index string
        r_author (:class:`_sre.SRE_pattern`): a compiled regex to match an
            authors list
        r_title (:class:`_sre.SRE_pattern`): a compiled regex to match a title
        r_abstract (:class:`_sre.SRE_pattern`): a compiled regex to match an
            abstract
        r_cite (:class:`_sre.SRE_pattern`): a compiled regex to match a
            citation

    Returns:
        dict: the paper as a dict with keys ``index``, ``authors``, ``title``,
        ``abstract`` and ``citations`` (``authors`` and ``citations`` are
        lists; the other fields are None when no line matched)
    """
    # Raw strings are required: '\*' in a normal string literal is an
    # invalid escape sequence.
    if r_index is None:
        r_index = re.compile(r'^#index(.*)')
    if r_author is None:
        r_author = re.compile(r'^#@(.*)')
    if r_title is None:
        r_title = re.compile(r'^#\*(.*)')
    if r_abstract is None:
        r_abstract = re.compile(r'^#!(.*)')
    if r_cite is None:
        r_cite = re.compile(r'^#%(.*)')

    p = {'index': None, 'authors': [], 'title': None, 'abstract': None, 'citations': []}

    for s in l_paper:
        # The checks are deliberately independent (no elif): caller-supplied
        # regexes are not guaranteed to be mutually exclusive.
        m_index = r_index.match(s)
        if m_index is not None:
            p['index'] = m_index.group(1)

        m_author = r_author.match(s)
        if m_author is not None:
            p['authors'] = [a.strip() for a in m_author.group(1).split(',')]

        m_title = r_title.match(s)
        if m_title is not None:
            p['title'] = m_title.group(1)

        m_abstract = r_abstract.match(s)
        if m_abstract is not None:
            p['abstract'] = m_abstract.group(1)

        m_cite = r_cite.match(s)
        if m_cite is not None:
            p['citations'].append(m_cite.group(1))

    return p
def get_author_papers(author_name, author_slug, input_file):
    """Returns the list of papers written by the author (list of dicts) from a raw text file.

    The text file must be formatted in the following way:

    * each paper is a block of lines;
    * each line represents either the index, the title, the abstract, the
      list of authors or a citation reference;
    * there is a way to recognise the type of the line with a regular
      expression;
    * the papers are separated by a blank line.

    Args:
        author_name (string): the real name of the user; a paper is kept only
            if this exact string appears in its list of authors
        author_slug (string): a short and ASCII string to replace the
            author's name (currently unused by this function; kept so the
            signature stays compatible with existing callers)
        input_file (string): the name of the file in which are stored the
            author's papers

    Returns:
        tuple: the author's papers as dictionaries: those with abstract and
        those without abstract
    """
    # Split the file into a list of lists (each sublist is a paper)
    raw_papers = []
    block = []
    with open(input_file, 'r') as f:
        for line in f:
            if line.strip():
                block.append(line)
            elif block:
                # A blank line closes the current paper; consecutive blank
                # lines do not create empty papers.
                raw_papers.append(block)
                block = []
    # Flush the last paper when the file does not end with a blank line;
    # without this the final paper would be silently dropped.
    if block:
        raw_papers.append(block)

    author_papers = []
    papers_without_abstract = []
    for raw in raw_papers:
        paper = list2paper(raw)
        if author_name not in paper['authors']:
            continue
        if paper['abstract'] is not None:
            author_papers.append(paper)
        else:
            papers_without_abstract.append(paper)

    return author_papers, papers_without_abstract
def generate_citations(author_papers):
    """Build the table of citation relations for a collection of papers.

    Each paper contributes one row per entry of its ``citations`` list, in
    the order the papers and their citations appear.

    Args:
        author_papers (list of dicts): the author's papers, as a list of
            dicts produced by the function
            :func:`recom.data_retrieval.list2paper`

    Returns:
        :class:`pandas.DataFrame`: the citation relations, with the columns
        ``citing`` (the paper's ``index``) and ``cited`` (the citation)
    """
    rows = [
        [paper['index'], reference]
        for paper in author_papers
        for reference in paper['citations']
    ]
    return pd.DataFrame(rows, columns=['citing', 'cited'])
def get_cited_papers(cited, db_cursor, papers_table='papers'):
    """Retrieves the cited papers data from a SQL database.

    The table ``papers_table`` must have the columns: ``id``, ``title`` and
    ``abstract``. Only rows whose abstract is not the empty string are
    selected.

    Args:
        cited (list of strings): list of the cited papers' ids
        db_cursor (:class:`MySQLdb.cursors.Cursor`): cursor of a SQL database
            in which there is a papers table
        papers_table (string): name of the papers table in the SQL database,
            optionally qualified as ``schema.table``

    Returns:
        tuple of tuples: the results of the SQL query (an empty tuple, without
        querying the database, when ``cited`` is empty)

    Raises:
        ValueError: if ``papers_table`` is not a plain SQL identifier
    """
    # A table name cannot be passed as a bound parameter, so it has to be
    # interpolated into the query text; restrict it to identifier characters
    # to keep the statement safe.
    if re.match(r'[A-Za-z0-9_$]+(?:\.[A-Za-z0-9_$]+)?\Z', papers_table) is None:
        raise ValueError("invalid table name: %r" % (papers_table,))

    ids = tuple(cited)
    if not ids:
        # "IN ()" is not valid SQL; nothing was cited, so nothing to fetch.
        return ()

    # One placeholder per id: the ids themselves are bound by the driver.
    placeholders = ','.join(["%s"] * len(ids))
    query = ("SELECT id, title, abstract FROM " + papers_table + " p "
             "WHERE p.abstract != '' AND p.id IN (" + placeholders + ")")
    db_cursor.execute(query, ids)

    return db_cursor.fetchall()
def get_irrelevant_papers(input_file):
    """Return the list of irrelevant papers written (list of dicts) from a raw text file.

    The file is read as blocks of non-blank lines separated by blank lines;
    each block is parsed with :func:`recom.data_retrieval.list2paper` and
    only the papers for which an abstract was found are kept.

    Args:
        input_file (string): relative path to the raw text file

    Returns:
        list of dicts: the list of irrelevant papers (with abstract)
        formatted as dicts
    """
    # Split the file into a list of lists (each sublist is a paper)
    raw_papers = []
    block = []
    with open(input_file, 'r') as f:
        for line in f:
            if line.strip():
                block.append(line)
            elif block:
                # A blank line closes the current paper; consecutive blank
                # lines do not create empty papers.
                raw_papers.append(block)
                block = []
    # Flush the last paper when the file does not end with a blank line;
    # without this the final paper would be silently dropped.
    if block:
        raw_papers.append(block)

    papers = [list2paper(raw) for raw in raw_papers]
    return [paper for paper in papers if paper['abstract'] is not None]
def get_irrelevant_cited_papers(bad_papers, db_cursor, papers_table='papers'):
    """Retrieves the papers cited by the irrelevant papers given in input, from a SQL database.

    The distinct citation ids of all the given papers are collected (in order
    of first appearance) and looked up in ``papers_table``, which must have
    the columns ``id``, ``title`` and ``abstract``. Only rows whose abstract
    is not the empty string are selected.

    Args:
        bad_papers (list of dicts): the list of irrelevant papers, formatted
            as the output of :func:`recom.data_retrieval.list2paper`
        db_cursor (:class:`MySQLdb.cursors.Cursor`): cursor of a SQL database
            in which there is a papers table
        papers_table (string): name of the papers table in the SQL database,
            optionally qualified as ``schema.table``

    Returns:
        tuple of tuples: the results of the SQL query (an empty tuple, without
        querying the database, when the papers cite nothing)

    Raises:
        ValueError: if ``papers_table`` is not a plain SQL identifier
    """
    # A table name cannot be passed as a bound parameter, so it has to be
    # interpolated into the query text; restrict it to identifier characters
    # to keep the statement safe.
    if re.match(r'[A-Za-z0-9_$]+(?:\.[A-Za-z0-9_$]+)?\Z', papers_table) is None:
        raise ValueError("invalid table name: %r" % (papers_table,))

    # Deduplicate the cited ids while keeping their first-seen order.
    seen = set()
    cited = []
    for paper in bad_papers:
        for reference in paper['citations']:
            if reference not in seen:
                seen.add(reference)
                cited.append(reference)

    if not cited:
        # "IN ()" is not valid SQL; nothing was cited, so nothing to fetch.
        return ()

    # One placeholder per id: the ids themselves are bound by the driver.
    placeholders = ','.join(["%s"] * len(cited))
    query = ("SELECT id, title, abstract FROM " + papers_table + " p "
             "WHERE p.abstract != '' AND p.id IN (" + placeholders + ")")
    db_cursor.execute(query, tuple(cited))

    return db_cursor.fetchall()