import os, re, time
import pandas as pd
import MySQLdb
def list2paper(l_paper, r_index=None, r_author=None, r_title=None, r_abstract=None, r_cite=None):
    """Transform a raw data paper (formatted as a list) into a dict.

    Each element of the list is matched against regular expressions that
    recognise the index, the authors list, the title, the abstract and the
    citations of the paper. If a regex is None, a default regex is used.
    Every regex must expose the payload of the line as its first group.

    Args:
        l_paper (list of strings): the list of elements forming the paper
            (title, authors, etc.), in raw format
        r_index (compiled regex): a compiled regex to match an index string
        r_author (compiled regex): a compiled regex to match an authors list
        r_title (compiled regex): a compiled regex to match a title
        r_abstract (compiled regex): a compiled regex to match an abstract
        r_cite (compiled regex): a compiled regex to match a citation

    Returns:
        dict: the paper as a dict with the keys ``index``, ``authors``,
        ``title``, ``abstract`` and ``citations``. ``authors`` and
        ``citations`` are lists; the other values stay ``None`` when no
        element of ``l_paper`` matches the corresponding regex.
    """
    p = {'index': None, 'authors': [], 'title': None, 'abstract': None, 'citations': []}

    # Default patterns are raw strings so that ``\*`` reaches the regex
    # engine as an escaped star instead of being an invalid string escape.
    if r_index is None:
        r_index = re.compile(r'^#index(.*)')
    if r_author is None:
        r_author = re.compile(r'^#@(.*)')
    if r_title is None:
        r_title = re.compile(r'^#\*(.*)')
    if r_abstract is None:
        r_abstract = re.compile(r'^#!(.*)')
    if r_cite is None:
        r_cite = re.compile(r'^#%(.*)')

    for s in l_paper:
        # Every regex is tried on every element: with custom patterns a
        # single element may legitimately match more than one of them.
        m_index = r_index.match(s)
        if m_index is not None:
            p['index'] = m_index.group(1).strip()

        m_author = r_author.match(s)
        if m_author is not None:
            # Authors are given as one comma-separated line.
            p['authors'] = [a.strip() for a in m_author.group(1).split(',')]

        m_title = r_title.match(s)
        if m_title is not None:
            p['title'] = m_title.group(1).strip()

        m_abstract = r_abstract.match(s)
        if m_abstract is not None:
            p['abstract'] = m_abstract.group(1).strip()

        m_cite = r_cite.match(s)
        if m_cite is not None:
            # One citation per line; a paper can have several of them.
            p['citations'].append(m_cite.group(1).strip())

    return p
def get_author_papers(author_name, author_slug, input_file):
    """Return the papers written by the author (list of dicts) from a raw text file.

    The text file must be formatted in the following way:

    * each paper is a block of lines;
    * each line represents either the index, the title, the abstract, the
      list of authors or a citation reference;
    * there is a way to recognise the type of the line with a regular
      expression;
    * the papers are separated by a blank line.

    Args:
        author_name (string): the real name of the user
        author_slug (string): a short and ASCII string to replace the
            author's name (not used by this function; kept so that the
            signature stays compatible with existing callers)
        input_file (string): the name of the file in which are stored the
            author's papers

    Returns:
        tuple: the author's papers as dictionaries: those with abstract and
        those without abstract
    """
    # Split the file into a list of lists (each sublist is a paper).
    blocks = []
    current = []
    with open(input_file, 'r') as f:
        for line in f:
            stripped = line.strip()
            if stripped != '':
                current.append(stripped)
            elif current:
                # A blank line closes the current paper; repeated blank
                # lines do not create empty papers.
                blocks.append(current)
                current = []
    if current:
        # The file may not end with a blank line: keep the last paper.
        blocks.append(current)

    papers = [list2paper(block) for block in blocks]

    author_papers = []
    papers_without_abstract = []
    for p in papers:
        if author_name in p['authors']:
            if p['abstract'] is not None:
                author_papers.append(p)
            else:
                papers_without_abstract.append(p)
    return author_papers, papers_without_abstract
def generate_citations(author_papers):
    """Return the citation relations.

    Args:
        author_papers (list of dicts): the author's papers, as a list of
            dicts produced by the function
            :func:`recom.data_retrieval.list2paper`

    Returns:
        :class:`pandas.DataFrame`: the citation relations, one row per
        (paper, cited paper) pair, with the columns ``citing`` and ``cited``
    """
    # One row per citation: the citing paper's index and the cited reference.
    citations = [[p['index'], c] for p in author_papers for c in p['citations']]
    return pd.DataFrame(citations, columns=['citing', 'cited'])
def get_cited_papers(cited, db_cursor, papers_table='papers'):
    """Retrieve the cited papers data from a SQL database.

    The table ``papers_table`` must have the columns: ``id``, ``title`` and
    ``abstract``. Only papers with a non-empty abstract are returned.

    Args:
        cited (list of strings): list of the cited papers' ids
        db_cursor (:class:`MySQLdb.cursors.Cursor`): cursor of a SQL database
            in which there is a papers table
        papers_table (string): name of the papers table in the SQL database

    Returns:
        tuple of tuples: the results of the SQL query (an empty tuple when
        ``cited`` is empty)

    Raises:
        ValueError: if ``papers_table`` is not a plain (optionally
            schema-qualified) SQL identifier
    """
    ids = tuple(cited)
    if not ids:
        # ``IN ()`` is not valid SQL; nothing can match anyway.
        return ()

    # A table name cannot be passed as a query parameter, so it is checked
    # against a strict identifier pattern before being put in the query text.
    if re.match(r'^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?\Z', papers_table) is None:
        raise ValueError('Invalid table name: %r' % (papers_table,))

    # Select the cited papers that have an abstract; the ids themselves are
    # bound as parameters, one placeholder per id.
    placeholders = ','.join(['%s'] * len(ids))
    query = ("SELECT id, title, abstract FROM " + papers_table
             + " p WHERE p.abstract != '' AND p.id IN (" + placeholders + ")")
    db_cursor.execute(query, ids)
    return db_cursor.fetchall()
def get_irrelevant_papers(input_file):
    """Return the list of irrelevant papers (list of dicts) from a raw text file.

    The file is expected to hold one paper per block of lines, the blocks
    being separated by a blank line.

    Args:
        input_file (string): relative path to the raw text file

    Returns:
        list of dicts: the list of irrelevant papers (with abstract)
        formatted as dicts
    """
    # Split the file into a list of lists (each sublist is a paper).
    blocks = []
    current = []
    with open(input_file, 'r') as f:
        for line in f:
            stripped = line.strip()
            if stripped != '':
                current.append(stripped)
            elif current:
                # A blank line closes the current paper; repeated blank
                # lines do not create empty papers.
                blocks.append(current)
                current = []
    if current:
        # The file may not end with a blank line: keep the last paper.
        blocks.append(current)

    papers = [list2paper(block) for block in blocks]

    # Only papers that come with an abstract are kept.
    papers_with_abstract = [p for p in papers if p['abstract'] is not None]
    return papers_with_abstract
def get_irrelevant_cited_papers(bad_papers, db_cursor, papers_table='papers'):
    """Retrieve the papers cited by the irrelevant papers given in input, from a SQL database.

    The table ``papers_table`` must have the columns: ``id``, ``title`` and
    ``abstract``. Only papers with a non-empty abstract are returned.

    Args:
        bad_papers (list of dicts): the list of irrelevant papers, formatted
            as the output of :func:`recom.data_retrieval.list2paper`
        db_cursor (:class:`MySQLdb.cursors.Cursor`): cursor of a SQL database
            in which there is a papers table
        papers_table (string): name of the papers table in the SQL database

    Returns:
        tuple of tuples: the results of the SQL query (an empty tuple when
        the papers cite nothing)

    Raises:
        ValueError: if ``papers_table`` is not a plain (optionally
            schema-qualified) SQL identifier
    """
    # Collect each cited id once, in order of first appearance.
    seen = set()
    cited = []
    for p in bad_papers:
        for c in p['citations']:
            if c not in seen:
                seen.add(c)
                cited.append(c)

    if not cited:
        # ``IN ()`` is not valid SQL; nothing can match anyway.
        return ()

    # A table name cannot be passed as a query parameter, so it is checked
    # against a strict identifier pattern before being put in the query text.
    if re.match(r'^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?\Z', papers_table) is None:
        raise ValueError('Invalid table name: %r' % (papers_table,))

    # The ids themselves are bound as parameters, one placeholder per id.
    placeholders = ','.join(['%s'] * len(cited))
    query = ("SELECT id, title, abstract FROM " + papers_table
             + " p WHERE p.abstract != '' AND p.id IN (" + placeholders + ")")
    db_cursor.execute(query, tuple(cited))
    return db_cursor.fetchall()