import pandas as pd
import argparse
import numpy as np
import re  # (https://docs.python.org/3/library/re.html) for tokenising textual data
import string  # (https://docs.python.org/3/library/string.html) for string operations

# Create the random number generator instance
rng = np.random.default_rng()


class TextPreprocess:
    """Text Preprocessing for a Natural Language Processing model."""

    def cleantext(self, df, text_column, remove_stopwords=True, remove_punc=True):
        """Function to clean text data by removing stopwords, tags and punctuation.

        Parameters
        ----------
        df : pandas dataframe
            The dataframe housing the input data.
        text_column : str
            Column in dataframe whose text is to be cleaned.
        remove_stopwords : bool
            If True, remove stopwords from text.
        remove_punc : bool
            If True, remove punctuation symbols from text.

        Returns
        -------
        numpy array
            Cleaned text.

        """
        # Note: operates on the caller's dataframe in place
        data = df
        # Convert all characters to lowercase
        data[text_column] = data[text_column].str.lower()

        # List of common stopwords taken from https://gist.github.com/sebleier/554280
        stopwords = [
            "a", "about", "above", "after", "again", "against", "all", "am", "an",
            "and", "any", "are", "as", "at", "be", "because", "been", "before",
            "being", "below", "between", "both", "but", "by", "could", "did", "do",
            "does", "doing", "down", "during", "each", "few", "for", "from",
            "further", "had", "has", "have", "having", "he", "he'd", "he'll",
            "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
            "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in",
            "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
            "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
            "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
            "same", "she", "she'd", "she'll", "she's", "should", "so", "some",
            "such", "than", "that", "that's", "the", "their", "theirs", "them",
            "themselves", "then", "there", "there's", "these", "they", "they'd",
            "they'll", "they're", "they've", "this", "those", "through", "to",
            "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
            "we're", "we've", "were", "what", "what's", "when", "when's", "where",
            "where's", "which", "while", "who", "who's", "whom", "why", "why's",
            "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
            "yours", "yourself", "yourselves"
        ]

        # Named drop_stopwords so it does not shadow the remove_stopwords flag
        def drop_stopwords(data, column):
            data[f'{column} without stopwords'] = data[column].apply(
                lambda x: ' '.join(word for word in x.split() if word not in stopwords))
            return data

        # Remove html tags such as <br /> from the text
        def remove_tags(text):
            return re.sub(r'<[^>]*>', '', text)

        if remove_stopwords:
            data_without_stopwords = drop_stopwords(data, text_column)
            data_without_stopwords[f'clean_{text_column}'] = (
                data_without_stopwords[f'{text_column} without stopwords'].apply(remove_tags))
        else:
            # Still strip the html tags so the clean column always exists
            data_without_stopwords = data
            data_without_stopwords[f'clean_{text_column}'] = data[text_column].apply(remove_tags)

        if remove_punc:
            # Replace every punctuation symbol with a space; re.escape guards
            # against regex metacharacters inside the character class
            data_without_stopwords[f'clean_{text_column}'] = (
                data_without_stopwords[f'clean_{text_column}'].str.replace(
                    '[{}]'.format(re.escape(string.punctuation)), ' ', regex=True))

        X = data_without_stopwords[f'clean_{text_column}'].to_numpy()
        return X
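
    # A minimal usage sketch for cleantext (the dataframe contents and the
    # "review" column name below are hypothetical, for illustration only):
    #
    #   prep = TextPreprocess()
    #   df = pd.DataFrame({"review": ["Loved this film. <br />A must-see!"]})
    #   X = prep.cleantext(df, "review", remove_stopwords=True, remove_punc=True)
    #   # X is a numpy array of lowercased reviews with stopwords,
    #   # html tags and punctuation removed
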
""" y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y))) arr_rand = rng.random(X.shape[0]) split = arr_rand < np.percentile(arr_rand, split_percentile) X_train = X[split] y_train = y[split] X_test = X[~split] y_test = y[~split] return (X_train, y_train, X_test, y_test) def sent_tokeniser (self, x): """Function to split text into sentences. Parameters ---------- x : str piece of text Returns ------- list sentences with punctuation removed. """ sentences = re.split(r'(?