Source code for pysentiment2.utils

"""
This module contains methods to tokenize sentences.
"""
import abc
import re
import nltk


class BaseTokenizer(object, metaclass=abc.ABCMeta):
    """
    An abstract base class for tokenizing text.
    """
    @abc.abstractmethod
    def tokenize(self, text):
        """Return tokenized terms.

        :type text: str
        :returns: list
        """
        pass
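
# A minimal illustrative subclass (not part of the original module), showing
# how the abstract interface above can be implemented. The class name
# WhitespaceTokenizer is hypothetical; it does no stemming and uses no
# stoplist.
class WhitespaceTokenizer(BaseTokenizer):
    """Example tokenizer that simply splits lower-cased text on whitespace."""

    def tokenize(self, text):
        return text.lower().split()
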
class Tokenizer(BaseTokenizer):
    """
    The default tokenizer for ``pysentiment2``, which recognizes only words
    made up of the characters ``[a-z]+``. The output of the tokenizer is
    stemmed by ``nltk.PorterStemmer``.

    The stoplist from https://www3.nd.edu/~mcdonald/Word_Lists.html is
    included in this tokenizer. Any word in the stoplist is excluded from
    the output.
    """
    def __init__(self):
        self._stemmer = nltk.PorterStemmer()
        self._stopset = self.get_stopset()
    def tokenize(self, text):
        tokens = []
        # Lower-case the input, keep only runs of a-z, then stem each token
        # and drop anything found in the stoplist.
        for t in nltk.regexp_tokenize(text.lower(), '[a-z]+'):
            t = self._stemmer.stem(t)
            if t not in self._stopset:
                tokens.append(t)
        return tokens
    def get_stopset(self):
        from pysentiment2.base import STATIC_PATH
        files = ['Currencies.txt', 'DatesandNumbers.txt', 'Generic.txt',
                 'Geographic.txt', 'Names.txt']
        stopset = set()
        for f in files:
            # The word lists ship as latin-1 encoded text; take the first
            # word-like match on each line and store its stemmed form.
            with open('%s/%s' % (STATIC_PATH, f), 'rb') as fin:
                for line in fin.readlines():
                    line = line.decode(encoding='latin-1')
                    match = re.search(r'(\w+)', line)
                    if match is None:
                        continue
                    word = match.group(1)
                    stopset.add(self._stemmer.stem(word.lower()))
        return stopset
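
# A minimal usage sketch (not part of the original module), assuming the
# stoplist files under STATIC_PATH are installed with the package. Tokens
# are returned in stemmed form; generic stopwords such as "the" are
# expected to be dropped by the stoplist.
if __name__ == '__main__':
    tokenizer = Tokenizer()
    # Roughly ['runner', 'run', 'quickli'] under the Porter stemmer,
    # assuming "the" and "was" appear in the stoplist.
    print(tokenizer.tokenize('The runner was running quickly'))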