Source code for string_tools

import numpy as np
import nltk
import re


class WordHasher(object):
    """Provides tools to transform a string into a bag-of-ngrams vector.

    Args:
        n (int): size of the n-grams
        bord (string): delimiter character used to mark word boundaries
    """
    def __init__(self, n=3, bord='#'):
        self.n_ = n
        self.bord_ = bord
        self.ngrams = []
    def init_ngrams(self, tokens):
        """Computes the n-grams from a list of words and assigns them to ``self.ngrams``.

        Args:
            tokens (list of strings): list of words from which to compute the n-grams
        """
        ngrams = set()
        for t in tokens:
            # Surround each token with the `bord` character
            t = self.bord_ + t + self.bord_
            # Collect every n-gram of the token (a string of length L has
            # L - n + 1 n-grams, hence the + 1 in the range)
            for i in range(len(t) - self.n_ + 1):
                ngrams.add(t[i:i + self.n_])
        self.ngrams = sorted(ngrams)
    def load_ngrams(self, ngrams_):
        """Loads a list of n-grams into ``self.ngrams``.

        Args:
            ngrams_ (list of strings): the list of n-grams to load
        """
        if len(ngrams_[0]) == self.n_:
            self.ngrams = ngrams_
        else:
            raise ValueError("Incompatible ngram sizes (n = %d expected, but got %d)"
                             % (self.n_, len(ngrams_[0])))
    def print_ngrams(self):
        """Prints the list of n-grams."""
        print(self.ngrams)
    def hash(self, s):
        """Transforms a string into an n-gram count representation.

        Args:
            s (string): the string to hash

        Returns:
            :class:`np.ndarray`: an n-gram count representation of the input string
        """
        # Initialize the counts
        counts = {g: 0 for g in self.ngrams}
        # Clean the string
        s = StringCleaner().clean_string(s)
        # Tokenize: keep acronyms, words with optional hyphens, and numbers with
        # an optional percentage sign; then drop English stopwords
        pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
        sl = set(nltk.regexp_tokenize(s, pattern)) - set(nltk.corpus.stopwords.words("english"))
        # Remove purely numeric tokens (for example: years, ...)
        num_re = re.compile(r"^\d+$")
        sl = {t for t in sl if not num_re.match(t)}
        # Stem the tokens (the string is assumed clean at this point)
        porter = nltk.stem.PorterStemmer()
        sl = [porter.stem(t) for t in sl]
        # Surround each word with the `bord` character (rebuilding the list:
        # reassigning the loop variable would leave `sl` unchanged)
        sl = [self.bord_ + a + self.bord_ for a in sl]
        # Hash the words and increment the counts
        for a in sl:
            for i in range(len(a) - self.n_ + 1):
                ngram = a[i:i + self.n_]
                if ngram in counts:
                    counts[ngram] += 1
        # Convert the result into an ndarray
        res = np.zeros(len(self.ngrams))
        for j, g in enumerate(self.ngrams):
            res[j] = counts[g]
        return res
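A minimal usage sketch for ``WordHasher`` (illustrative, not part of the original module; it assumes the NLTK ``stopwords`` corpus has been downloaded, e.g. with ``nltk.download("stopwords")``):

    hasher = WordHasher(n=3)
    hasher.init_ngrams(["cat", "dog", "fish"])   # vocabulary: 3-grams of "#cat#", "#dog#", "#fish#"
    vec = hasher.hash("A cat watches the fish")  # counts of the known 3-grams in the stemmed words
    print(vec.shape)                             # (len(hasher.ngrams),)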
class StringHasher(object):
    """Provides tools to transform a sentence into a bag-of-words vector.

    Args:
        n (int): size of the n-grams
    """
    def __init__(self, n=1):
        self.n_ = n
        self.ngrams = []
    def init_ngrams(self, tokens):
        """Computes the n-grams from a list of words and assigns them to ``self.ngrams``.

        Todo:
            deal with the case n != 1

        Args:
            tokens (list of strings): list of words from which to compute the n-grams
        """
        self.ngrams = sorted(set(tokens))
    def load_ngrams(self, ngrams_):
        """Loads a list of n-grams into ``self.ngrams``.

        Args:
            ngrams_ (list of strings): the list of n-grams to load
        """
        # Here an n-gram is a sequence of n space-separated words, so the size
        # check (and the error message) must count words, not characters
        if len(ngrams_[0].split(' ')) == self.n_:
            self.ngrams = ngrams_
        else:
            raise ValueError("Incompatible ngram sizes (n = %d expected, but got %d)"
                             % (self.n_, len(ngrams_[0].split(' '))))
    def print_ngrams(self):
        """Prints the list of n-grams."""
        print(self.ngrams)
    def hash(self, s):
        """Transforms a string into an n-gram count representation.

        Args:
            s (string): the string to hash

        Returns:
            :class:`np.ndarray`: an n-gram count representation of the input string
        """
        # Initialize the counts
        counts = {g: 0 for g in self.ngrams}
        # Clean the string
        s = StringCleaner().clean_string(s)
        # Tokenize: keep acronyms, words with optional hyphens, and numbers with
        # an optional percentage sign; then drop English stopwords
        pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
        sl = set(nltk.regexp_tokenize(s, pattern)) - set(nltk.corpus.stopwords.words("english"))
        # Remove purely numeric tokens (for example: years, ...)
        num_re = re.compile(r"^\d+$")
        sl = {t for t in sl if not num_re.match(t)}
        # Stem the tokens (the string is assumed clean at this point)
        porter = nltk.stem.PorterStemmer()
        sl = [porter.stem(t) for t in sl]
        # Increment the counts
        for a in sl:
            if a in counts:
                counts[a] += 1
        # Convert the result into an ndarray
        res = np.zeros(len(self.ngrams))
        for j, g in enumerate(self.ngrams):
            res[j] = counts[g]
        return res
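And a corresponding sketch for ``StringHasher`` (same assumptions). Note that the vocabulary should contain stemmed forms, since ``hash`` stems its tokens before counting, and that tokens are deduplicated, so each distinct surface form is counted at most once:

    sh = StringHasher()
    sh.init_ngrams(["cat", "dog"])       # bag-of-words vocabulary (stemmed forms)
    vec = sh.hash("The cat sees a dog")  # vec[i] is the count of sh.ngrams[i]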
class StringCleaner(object):
    """Provides tools to clean strings: lowercasing, accent removal and standardisation."""

    def __init__(self):
        # Accented / special characters, and their replacements at the same index.
        # The replacements are kept in a list (not a flat string) because some of
        # them ("ae", "oe", "ij") are longer than one character, which would shift
        # the index alignment in a plain string.
        self.diacriticLetters_ = "àáâãāăȧäảåǎȁȃąạḁẚæǽǣḅḇƀćĉċčƈçḉȼḋɗḍḏḑḓďđðèéêẽēĕėëẻěȅȇẹȩęḙḛǵĝḡğġǧɠģǥĥḣḧȟḥḩḫẖħìíîĩīĭïǐịįȉȋḭɨijĵǰḱǩḵƙḳķĺḻḷļḽľŀłƚḹµḿṁṃɱɯǹńñṅňŋɲṇņṋṉʼnƞòóôõōŏȯöỏőǒȍȏơǫọøǿœṕṗŕṙřȑȓṛŗṟśŝṡšṣșşẗťṭțţṱṯùúûũūŭüủůǔȗụṳųṷṵṽṿẁẃŵẇẅẘẉẋẍỳýŷȳẏÿỷẙźẑżžȥẓẕƶß"
        self.noDiacriticLetters_ = (
            ["a"] * 17
            + ["ae"] * 3
            + ["b"] * 3
            + ["c"] * 8
            + ["d"] * 9
            + ["e"] * 17
            + ["g"] * 9
            + ["h"] * 9
            + ["i"] * 14
            + ["ij"]
            + ["j"] * 2
            + ["k"] * 6
            + ["l"] * 10
            + ["m"] * 6
            + ["n"] * 13
            + ["o"] * 18
            + ["oe"]
            + ["p"] * 2
            + ["r"] * 8
            + ["s"] * 7
            + ["t"] * 7
            + ["u"] * 16
            + ["v"] * 2
            + ["w"] * 7
            + ["x"] * 2
            + ["y"] * 8
            + ["z"] * 8
            + ["s"]  # ß
        )
    def remove_accents(self, s):
        """Replaces all accented characters with their non-accented equivalent.

        Args:
            s (string): the string to transform

        Returns:
            string: the deburred string
        """
        output = ""
        for c in s:
            try:
                dIndex = self.diacriticLetters_.index(c)
                output += self.noDiacriticLetters_[dIndex]
            except ValueError:
                output += c
        return output
    def clean_string(self, s, bord=''):
        """Applies cleaning operations to a string, in particular accent removal.

        Args:
            s (string): the string to clean
            bord (string): an optional delimiter character to strip from the string

        Returns:
            string: the cleaned string
        """
        # Lowercase
        s = s.lower()
        # Remove accents
        s = self.remove_accents(s)
        # Strip the `bord` character, if any
        if bord:
            s = s.replace(bord, '')
        # Replace backslashes by spaces (LaTeX-like syntax)
        s = s.replace('\\', ' ')
        return s
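Finally, a quick sketch of ``StringCleaner`` on its own (illustrative):

    cleaner = StringCleaner()
    print(cleaner.remove_accents("déjà vu"))      # "deja vu"
    print(cleaner.clean_string("Crème\\Brûlée"))  # "creme brulee" (lowercased, deburred, '\' -> ' ')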