# Source code for textpp_ptbr.preprocessing
"""
:Authors:
Fernando Sola Pereira
"""
import re
import os
import unicodedata
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
class TextPreProcessing:
    """Collection of static methods used to perform common text cleanup tasks
    focused on the Portuguese language.

    This class uses word dictionaries (loaded lazily from the ``dictionaries``
    directory next to this module) and regular expressions to expose a set of
    features that help process Portuguese texts.
    """

    # Patterns compiled eagerly below; dictionary-backed patterns start as
    # ``None`` and are compiled on first use by the corresponding method.
    __re_hour_pattern = re.compile(r'(^|\b)(\d)+(\s)*(h|hr|hrs|hs)($|\b)', re.IGNORECASE)
    __re_common_person_names = None
    __re_stopwords = None
    __re_reduced_or_contracted_words = None
    __re_numbers_in_full = None
    __re_pronouns = None
    __re_adverbs = None
    __re_remove_excessive_spaces = re.compile(' +')
    __re_numbers_with_symbols = re.compile(r'([\d]+)([./-])*([\d ])')
    __re_pure_numbers = re.compile(r'(^|\b)(\d+)(\b|$)')
    __re_urls = re.compile(r'[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?')
    # Translation table mapping every special character to a space; built once
    # so remove_special_characters runs in a single pass instead of one
    # str.replace call per character.
    __special_chars_table = str.maketrans(
        {c: ' ' for c in '-#?º°ª.:/;~^`[{]}\\|!$%"\'&*()=+,><\t\r\n…'})

    @classmethod
    def __get_dicionary(cls, dict_name):
        """Load a word list (one entry per line) from the bundled
        ``dictionaries`` directory.

        NOTE: the historical misspelling ("dicionary") is kept on purpose so
        the mangled private name stays stable.
        """
        path = os.path.join(BASE_DIR, 'dictionaries', dict_name)
        with open(path, 'r', encoding='utf-8') as dictionary:
            return [p.replace('\n', '') for p in dictionary]

    @classmethod
    def get_stopwords(cls):
        """Return a list of Brazilian Portuguese stopwords.

        All stopwords were extracted from NLTK.
        """
        return cls.__get_dicionary('stopwords.dic')

    @classmethod
    def remove_hour(cls, text):
        """Remove hour patterns (e.g. ``12h``, ``13hs``) from texts.

        .. code-block::

            In [ ]: from textpp_ptbr.preprocessing import TextPreProcessing as tpp
               ...: tpp.remove_hour('some text with 12h or another 13hs time explicit')
            Out[ ]: 'some text with or another time explicit'
        """
        return cls.__re_hour_pattern.sub(' ', text)

    @classmethod
    def remove_person_names(cls, text):
        """Remove common person names.

        All accents are removed before identifying names.  This method uses a
        dictionary with Brazilian common names to build a regular expression
        that matches those names (compiled lazily on first call).

        .. code-block::

            In [ ]: from textpp_ptbr.preprocessing import TextPreProcessing as tpp
               ...: tpp.remove_person_names('Afirma o réu que seu funcionário Mário Tadeu dirigia o veículo na ocasião.')
            Out[ ]: 'Afirma o reu que seu funcionario dirigia o veiculo na ocasiao.'
        """
        text = cls.remove_accents(text)
        if not cls.__re_common_person_names:
            dictionary = cls.__get_dicionary('common_person_names.dic')
            dictionary = [cls.remove_accents(p) for p in dictionary]
            cls.__re_common_person_names = re.compile(r'(^|\b)(' + r'|'.join(dictionary) + r')($|\b)')
        return cls.__re_common_person_names.sub(' ', text)

    @classmethod
    def remove_pronouns(cls, text):
        """Remove pronouns (dictionary-based, case-insensitive).

        .. code-block::

            In [ ]: from textpp_ptbr.preprocessing import TextPreProcessing as tpp
               ...: tpp.remove_pronouns('Ninguém sabe ao certo donde partiram os gritos.')
            Out[ ]: 'Ninguém sabe certo partiram os gritos.'
        """
        if not cls.__re_pronouns:
            palavras = cls.__get_dicionary('pronouns.dic')
            cls.__re_pronouns = re.compile(r'(^|\b)(' + r'|'.join(palavras) + r')($|\b)', re.IGNORECASE)
        return cls.__re_pronouns.sub(' ', text)

    @classmethod
    def remove_reduced_or_contracted_words(cls, text):
        """Remove reduced or contracted words (dictionary-based).

        .. code-block::

            In [ ]: from textpp_ptbr.preprocessing import TextPreProcessing as tpp
               ...: tpp.remove_reduced_or_contracted_words('Ninguém sabe ao certo donde partiram os gritos.')
            Out[ ]: 'Ninguém sabe certo partiram os gritos.'
        """
        if not cls.__re_reduced_or_contracted_words:
            palavras = cls.__get_dicionary('contracted_words.dic')
            cls.__re_reduced_or_contracted_words = re.compile(r'(^|\b)(' + r'|'.join(palavras) + r')($|\b)')
        return cls.__re_reduced_or_contracted_words.sub(' ', text)

    @classmethod
    def remove_adverbs(cls, text):
        """Remove adverbs (dictionary-based).

        .. code-block::

            In [ ]: from textpp_ptbr.preprocessing import TextPreProcessing as tpp
               ...: tpp.remove_adverbs('Chegaram tarde para o Jantar. Era a moça mais bonita da festa. Partiram ontem apressadamente.')
            Out[ ]: 'Chegaram para o Jantar. Era a moça bonita da festa. Partiram .'
        """
        if not cls.__re_adverbs:
            palavras = cls.__get_dicionary('adverbs.dic')
            cls.__re_adverbs = re.compile(r'(^|\b)(' + r'|'.join(palavras) + r')($|\b)')
        return cls.__re_adverbs.sub(' ', text)

    @staticmethod
    def remove_special_characters(text):
        """Replace punctuation and other special characters with spaces.

        Uses a pre-built translation table (one pass over the text) instead of
        the previous loop of sequential ``str.replace`` calls; the result is
        identical because every listed character maps to a space.
        """
        return text.translate(TextPreProcessing.__special_chars_table)

    @classmethod
    def remove_excessive_spaces(cls, texto):
        """Collapse runs of consecutive spaces into a single space.

        ``None`` is returned unchanged.  (The previous implementation raised
        ``TypeError`` on ``None`` because its guard branch still called
        ``re.sub`` on the value.)
        """
        if texto is None:
            return texto
        return cls.__re_remove_excessive_spaces.sub(' ', texto)

    @staticmethod
    def remove_accents(text):
        """Strip diacritics, reducing the text to plain ASCII characters.

        ``None`` and empty/whitespace-only inputs are returned unchanged.
        """
        if text is None or len(text.strip()) == 0:
            return text
        return unicodedata.normalize('NFKD', text).encode(
            'ASCII', 'ignore').decode('ASCII')

    @classmethod
    def remove_symbols_from_numbers(cls, text):
        """Remove ``.``, ``/`` and ``-`` symbols attached to digits
        (e.g. ``12.345`` becomes ``12345``)."""
        return cls.__re_numbers_with_symbols.sub(r'\1\3', text)

    @classmethod
    def remove_numbers(cls, text):
        """Replace standalone (digit-only) numbers with a space."""
        return cls.__re_pure_numbers.sub(r' ', text)

    @classmethod
    def remove_numbers_in_full(cls, text):
        """Remove numbers written out as words (dictionary-based)."""
        if not cls.__re_numbers_in_full:
            palavras = cls.__get_dicionary('numbers_in_full.dic')
            cls.__re_numbers_in_full = re.compile(r'(^|\b)(' + r'|'.join(palavras) + r')($|\b)')
        return cls.__re_numbers_in_full.sub(' ', text)

    @classmethod
    def remove_urls(cls, text):
        """Replace URL-like tokens with a space."""
        return cls.__re_urls.sub(r' ', text)

    @classmethod
    def remove_stopwords(cls, texto):
        """Remove Brazilian Portuguese stopwords (see :meth:`get_stopwords`)."""
        if not cls.__re_stopwords:
            stopwords = cls.get_stopwords()
            cls.__re_stopwords = re.compile(r'(^|\b)(' + r'|'.join(stopwords) + r')($|\b)')
        return cls.__re_stopwords.sub(' ', texto)