from IPython.display import display
import itertools
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np
import pandas as pd
from string import punctuation
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from time import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF, PCA, SparsePCA, IncrementalPCA, FastICA, DictionaryLearning, FactorAnalysis, MiniBatchDictionaryLearning, SparseCoder, IncrementalPCA, KernelPCA, MiniBatchNMF, MiniBatchSparsePCA
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\SunilYadav\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\SunilYadav\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\SunilYadav\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package omw-1.4 to [nltk_data] C:\Users\SunilYadav\AppData\Roaming\nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
True
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
df = pd.read_csv("./complaints.csv.zip", compression="zip")
<ipython-input-4-d13dcad04102>:2: DtypeWarning: Columns (9) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv("./complaints.csv.zip", compression="zip")
df.shape
(2712104, 18)
sampled_df = df.sample(frac=0.1, random_state=1)
sampled_df.shape
(271210, 18)
df.columns
Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID'], dtype='object')
sampled_df.head()
Date received | Product | Sub-product | Issue | Sub-issue | Consumer complaint narrative | Company public response | Company | State | ZIP code | Tags | Consumer consent provided? | Submitted via | Date sent to company | Company response to consumer | Timely response? | Consumer disputed? | Complaint ID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1041418 | 2019-12-27 | Credit reporting, credit repair services, or other personal consumer reports | Credit reporting | Incorrect information on your report | Account information incorrect | NaN | Company has responded to the consumer and the CFPB and chooses not to provide a public response | CITIBANK, N.A. | TN | 38114.0 | NaN | Consent not provided | Web | 2019-12-27 | Closed with explanation | Yes | NaN | 3478855 |
2048480 | 2015-03-19 | Mortgage | FHA mortgage | Credit decision / Underwriting | NaN | I had homesite home owners ins. It was {$2000.00} a year. I had my mortgage company pay my taxes and insurance through an escrow account. I paid {$1500.00} as a fixed price. My taxes have NOT gone up. My PMI has NOT gone up. I changed my insurance company to liberty mutual. My new yearly price is {$1000.00}. When I notified Bogman Inc. of the change, my new monthly payment only went Down {$10.00}. I have called the company 4 times. First speaking to " XXXX '' whom did not help me, then to her manager " XXXX XXXX ''. Who is telling me they have only been paying {$1000.00} a year for my previous years insurance, which is a lie. | Company believes it acted appropriately as authorized by contract or law | BOGMAN, INC | CT | 6410.0 | NaN | Consent provided | Web | 2015-03-25 | Closed with explanation | Yes | No | 1291276 |
2320600 | 2021-10-02 | Credit reporting, credit repair services, or other personal consumer reports | Credit reporting | Incorrect information on your report | Information belongs to someone else | NaN | NaN | EQUIFAX, INC. | FL | 33020.0 | NaN | Consent not provided | Web | 2021-10-02 | Closed with explanation | Yes | NaN | 4772701 |
2539072 | 2022-02-17 | Debt collection | I do not know | Attempts to collect debt not owed | Debt is not yours | Hi, my name is XXXX XXXX and u would like to dispute a inquiry on my credit that is incorrect and i believe it should be removed contact me at XXXX or by email at XXXX XXXX XXXX You may contact them at XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX , GA XXXX, or visit XXXX XXXX XXXX \n\nKnow your rights A Summary of Your Rights Under the Fair Credit Reporting Act The federal Fair Credit Reporting Act ( FCRA ) promotes the accuracy, fairness, and privacy of information in the files of consumer reporting agencies. There are many types of consumer reporting agencies, including credit bureaus and specialty agencies ( such as agencies that sell information about check writing histories, medical records, and rental history records ). Here is a summary of your major rights under the FCRA. For more information, including information about additional rights, go to www.consumerfinance.gov/learnmore or write to : Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552. Continue Reading En Espaol Para informacin en espaol, visite www.consumerfinance.gov/learnmore o escribe a la Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552.\n\nServices Overview Reports & Scores Personal FinancesBETA Financial Profile Protection Credit Cards Loans Auto Tools Support Disputes Guide FAQs Glossary Education Get the free Experian app : Follow us : About Us Contact Us Terms & Conditions Privacy Policy XXXX Experian. All rights reserved. \nExperian and the Experian trademarks used herein are trademarks or registered trademarks of Experian Information Solutions , Inc., ConsumerInfo.com XXXX IncXXXX or its affiliates. Other product or company names mentioned herein are the property of their respective owners. Licenses and Disclosures. \n\nHard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries Hard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries | Company has responded to the consumer and the CFPB and chooses not to provide a public response | Experian Information Solutions Inc. | NV | 89120.0 | NaN | Consent provided | Web | 2022-02-17 | Closed with explanation | Yes | NaN | 5234095 |
169976 | 2022-02-15 | Money transfer, virtual currency, or money service | Money order | Lost or stolen money order | NaN | NaN | NaN | Redstone Funding, LLC | AL | 35673 | NaN | Consent not provided | Web | 2022-02-15 | Untimely response | No | NaN | 5226465 |
sampled_df["Products"] = sampled_df["Product"] + "~~" + sampled_df["Sub-product"]
sampled_df["Issues"] = sampled_df["Issue"] + "~~" + sampled_df["Sub-issue"]
sampled_df.columns
Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID', 'Products', 'Issues'], dtype='object')
analysis_df = sampled_df[["Products", "Issues", "Consumer complaint narrative", "Company", "ZIP code", "Tags"]]
analysis_df.head()
Products | Issues | Consumer complaint narrative | Company | ZIP code | Tags | |
---|---|---|---|---|---|---|
1041418 | Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting | Incorrect information on your report~~Account information incorrect | NaN | CITIBANK, N.A. | 38114.0 | NaN |
2048480 | Mortgage~~FHA mortgage | NaN | I had homesite home owners ins. It was {$2000.00} a year. I had my mortgage company pay my taxes and insurance through an escrow account. I paid {$1500.00} as a fixed price. My taxes have NOT gone up. My PMI has NOT gone up. I changed my insurance company to liberty mutual. My new yearly price is {$1000.00}. When I notified Bogman Inc. of the change, my new monthly payment only went Down {$10.00}. I have called the company 4 times. First speaking to " XXXX '' whom did not help me, then to her manager " XXXX XXXX ''. Who is telling me they have only been paying {$1000.00} a year for my previous years insurance, which is a lie. | BOGMAN, INC | 6410.0 | NaN |
2320600 | Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting | Incorrect information on your report~~Information belongs to someone else | NaN | EQUIFAX, INC. | 33020.0 | NaN |
2539072 | Debt collection~~I do not know | Attempts to collect debt not owed~~Debt is not yours | Hi, my name is XXXX XXXX and u would like to dispute a inquiry on my credit that is incorrect and i believe it should be removed contact me at XXXX or by email at XXXX XXXX XXXX You may contact them at XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX , GA XXXX, or visit XXXX XXXX XXXX \n\nKnow your rights A Summary of Your Rights Under the Fair Credit Reporting Act The federal Fair Credit Reporting Act ( FCRA ) promotes the accuracy, fairness, and privacy of information in the files of consumer reporting agencies. There are many types of consumer reporting agencies, including credit bureaus and specialty agencies ( such as agencies that sell information about check writing histories, medical records, and rental history records ). Here is a summary of your major rights under the FCRA. For more information, including information about additional rights, go to www.consumerfinance.gov/learnmore or write to : Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552. Continue Reading En Espaol Para informacin en espaol, visite www.consumerfinance.gov/learnmore o escribe a la Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552.\n\nServices Overview Reports & Scores Personal FinancesBETA Financial Profile Protection Credit Cards Loans Auto Tools Support Disputes Guide FAQs Glossary Education Get the free Experian app : Follow us : About Us Contact Us Terms & Conditions Privacy Policy XXXX Experian. All rights reserved. \nExperian and the Experian trademarks used herein are trademarks or registered trademarks of Experian Information Solutions , Inc., ConsumerInfo.com XXXX IncXXXX or its affiliates. Other product or company names mentioned herein are the property of their respective owners. Licenses and Disclosures. \n\nHard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries Hard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries | Experian Information Solutions Inc. | 89120.0 | NaN |
169976 | Money transfer, virtual currency, or money service~~Money order | NaN | NaN | Redstone Funding, LLC | 35673 | NaN |
analysis_df.rename(columns={"Consumer complaint narrative": "complaint", "ZIP code": "Zip", "Products": "Product", "Issues": "Issue"}, inplace=True) # type: ignore
<ipython-input-14-8b7d2ffb9d8f>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy analysis_df.rename(columns={"Consumer complaint narrative": "complaint", "ZIP code": "Zip", "Products": "Product", "Issues": "Issue"}, inplace=True) # type: ignore
analysis_df.columns
Index(['Product', 'Issue', 'complaint', 'Company', 'Zip', 'Tags'], dtype='object')
analysis_df.value_counts("Product") # type: ignore
Product Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting 112119 Credit card or prepaid card~~General-purpose credit card or charge card 11093 Checking or savings account~~Checking account 9754 Debt collection~~I do not know 9445 Mortgage~~Other mortgage 8603 ... Other financial service~~Foreign currency exchange 6 Vehicle loan or lease~~Title loan 4 Payday loan, title loan, or personal loan~~Pawn loan 3 Prepaid card~~Transit card 2 Virtual currency~~Domestic (US) money transfer 2 Length: 92, dtype: int64
analysis_df.value_counts("Company") # type: ignore
Company EQUIFAX, INC. 43687 TRANSUNION INTERMEDIATE HOLDINGS, INC. 34533 Experian Information Solutions Inc. 32268 BANK OF AMERICA, NATIONAL ASSOCIATION 11351 WELLS FARGO & COMPANY 9864 ... ROCKY MOUNTAIN MORTGAGE COMPANY 1 Franklin Financial Corporation 1 Franks Adjustment Bureau, Inc. 1 RNN Group, Inc 1 snw investments 1 Length: 3634, dtype: int64
analysis_df.dropna(subset=["complaint"], inplace=True) # type: ignore
<ipython-input-18-d65b03af9762>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy analysis_df.dropna(subset=["complaint"], inplace=True) # type: ignore
analysis_df.shape
(95341, 6)
punctuation = list(punctuation)
stop_words = stopwords.words('english')
porter = PorterStemmer()
lancaster = LancasterStemmer()
englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
wordnet_lemmatizer = WordNetLemmatizer()
def data_cleaner(complaint_text: str):
complaint_text = complaint_text.lower()
complaint_text = complaint_text.replace("\n", " ")
complaint_text = complaint_text.replace("\r", " ")
complaint_text = complaint_text.replace("\t", " ")
complaint_text = complaint_text.replace(".", "")
complaint_text = complaint_text.replace("\xa0", " ")
complaint_text = complaint_text.replace("xx/xx/xxxx", " ")
complaint_text = complaint_text.replace("xxxx", " ")
complaint_text = complaint_text.replace("xx", " ")
tokens = word_tokenize(complaint_text)
complaint_text = " ".join([token for token in tokens if token not in stop_words and token not in punctuation and not token.isdigit() and len(token) > 3])
re.sub(r"\d+", "", complaint_text)
# re.sub(r"\d+", "", complaint_text)
# re.sub(r"\{\$\d+\.\d+\}", "", complaint_text)
re.sub(r"[^A-Za-z]+", "", complaint_text)
complaint_text = " ".join([wordnet_lemmatizer.lemmatize(word) for word in complaint_text.split()])
return complaint_text
'''
target_text = data_cleaner(analysis_df["complaint"].iloc[1])
t0 = time()
for _ in range(10000):
" ".join([wordnet_lemmatizer.lemmatize(word) for word in rget_text.split()])
print("Lemmatization took %.2f seconds" % (time() - t0))
t0 = time()
for _ in range(10000):
" ".join([englishStemmer.stem(word) for word in target_text.split()
print("English Stemming took %.2f seconds" % (time() - t0))
t0 = time()
for _ in range(10000):
" ".join([porter.stem(word) for word in target_text.split()])
print("Porter Stemming took %.2f seconds" % (time() - t0))
t0 = time()
for _ in range(10000):
" ".join([lancaster.stem(word) for word in target_text.split()])
print("Lancaster Stemming took %.2f seconds" % (time() - t0))
'''
'\n target_text = data_cleaner(analysis_df["complaint"].iloc[1])\n t0 = time()\n for _ in range(10000):\n " ".join([wordnet_lemmatizer.lemmatize(word) for word in rget_text.split()])\n print("Lemmatization took %.2f seconds" % (time() - t0))\n t0 = time()\n for _ in range(10000):\n " ".join([englishStemmer.stem(word) for word in target_text.split()\n print("English Stemming took %.2f seconds" % (time() - t0))\n t0 = time()\n for _ in range(10000):\n " ".join([porter.stem(word) for word in target_text.split()])\n print("Porter Stemming took %.2f seconds" % (time() - t0))\n t0 = time()\n for _ in range(10000):\n " ".join([lancaster.stem(word) for word in target_text.split()])\n print("Lancaster Stemming took %.2f seconds" % (time() - t0))\n'
analysis_df["complaint"] = analysis_df["complaint"].apply(data_cleaner)
<ipython-input-23-bfc8721173c5>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy analysis_df["complaint"] = analysis_df["complaint"].apply(data_cleaner)
analysis_df.head()
Product | Issue | complaint | Company | Zip | Tags | |
---|---|---|---|---|---|---|
2048480 | Mortgage~~FHA mortgage | NaN | homesite home owner year mortgage company tax insurance escrow account paid fixed price tax gone gone changed insurance company liberty mutual yearly price notified bogman change monthly payment went called company time first speaking help manager telling paying year previous year insurance | BOGMAN, INC | 6410.0 | NaN |
2539072 | Debt collection~~I do not know | Attempts to collect debt not owed~~Debt is not yours | name would like dispute inquiry credit incorrect believe removed contact email contact visit know right summary right fair credit reporting federal fair credit reporting fcra promotes accuracy fairness privacy information file consumer reporting agency many type consumer reporting agency including credit bureau specialty agency agency sell information check writing history medical record rental history record summary major right fcra information including information additional right wwwconsumerfinancegov/learnmore write consumer financial protection bureau street washington continue reading espaol para informacin espaol visite wwwconsumerfinancegov/learnmore escribe consumer financial protection bureau street washington service overview report score personal financesbeta financial profile protection credit card loan auto tool support dispute guide faq glossary education free experian follow contact term condition privacy policy experian right reserved experian experian trademark used herein trademark registered trademark experian information solution consumerinfocom affiliate product company name mentioned herein property respective owner license disclosure hard inquiry inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record soft inquiry hard inquiry inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record soft inquiry | Experian Information Solutions Inc. | 89120.0 | NaN |
1538365 | Credit card or prepaid card~~General-purpose credit card or charge card | Other features, terms, or problems~~Problem with rewards from credit card | approved amex gold delta skymiles gold amex platinum delta skymiles platinum card gold card point bonus spending platinum card point bonus spending within first month term condition stated bonus would awarded within week hitting spend threshold bonus threshold gold card platinum card called american express couple week bonus threshold told bonus post automatically week date threshold call back point would manually applied account immediately week later bonus posted account called american express spoke manager named operator told awarded bonus total point manually posting account would point within 24-72 hour point never posted called american express back told point never posted even though told would post within 24-72 hour manager spoke said nothing could would wait point would post requested coordinate call back point post even though told would within 24-72 hour said would reach would call back within business day never called called american express spoke supervisor requesting point manually awarded supervisor acknowledged awarded point said nothing could supervisor also told need patient wait point post nothing additional could asked never called back said know would another request call assured time would call within business day also putting separate request someone higher american express would calling within day well discus issue offer resolution today still received bonus point either card heard absolutely nothing anyone else american express three month since spend threshold month since told bonus point would manually added account regular representative supervisor american express refusing assist offer explanation essentially told jump lake even though acknowledged received bonus american express action dishonest unethical expressly breached term card agreement entered applied card spent hour phone american express trying resolve calm peaceful manner avail respectfully request cfpb reach american express behalf request correct injustice award bonus point deserve thank | AMERICAN EXPRESS COMPANY | 11367.0 | NaN |
2056487 | Mortgage~~Conventional fixed mortgage | NaN | bank america denying loan modification year | BANK OF AMERICA, NATIONAL ASSOCIATION | 95304.0 | Older American |
2423207 | Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting | Incorrect information on your report~~Old information reappears or never goes away | account violating fcra right deleting account year still reporting equifax beyond statute limitation reporting | EQUIFAX, INC. | 46804 | NaN |
word_counts = analysis_df["complaint"].str.split().apply(lambda x: len(x))
pd.Series(analysis_df["complaint"].str.split().apply(lambda x: len(x))).describe()
count 95341.000000 mean 77.247082 std 94.253957 min 0.000000 25% 26.000000 50% 50.000000 75% 93.000000 max 2574.000000 Name: complaint, dtype: float64
analysis_df[analysis_df["complaint"].str.split().apply(lambda x: len(x)) < 10]["complaint"]
2056487 bank america denying loan modification year 2622762 account closed debt paid full account show still owed 2697336 capital inquires date remember others 2315623 unauthorized inquiry credit report must removed soon possible 2014006 money date receiving medicaid medicaid paid ... 2163112 427550 creditor contacted knowledge debt contacted attorney debt 1983098 specialized loan service repeatedly denies application mortgage assistance 1070682 inquiry credit report recognize 1882059 information transferred another company without consent Name: complaint, Length: 4903, dtype: object
analysis_df = analysis_df[analysis_df["complaint"].str.split().apply(lambda x: len(x)) > 10]
pd.Series(analysis_df["complaint"].str.split().apply(lambda x: len(x))).describe()
count 89478.000000 mean 81.857138 std 95.498488 min 11.000000 25% 30.000000 50% 54.000000 75% 98.000000 max 2574.000000 Name: complaint, dtype: float64
n_features = 1000
n_components = 20
n_top_words = 20
batch_size = 128
init = "nndsvda"
random_state = 42
num_words = 30
count_vectorizer = CountVectorizer(stop_words=stop_words, max_df=0.95, min_df=0.01, max_features=n_features, ngram_range=(1, 2))
tf_idf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.95, min_df=0.01, max_features=n_features, ngram_range=(1, 2))
X_train, X_test = train_test_split(analysis_df, test_size=0.2, random_state=1)
n_samples = X_train.shape[0]
X_train_count_vectors = count_vectorizer.fit_transform(X_train["complaint"])
X_test_count_vectors = count_vectorizer.fit(X_test["complaint"])
X_train_tfidf_vectors = tf_idf_vectorizer.fit_transform(X_train["complaint"])
X_test_tfidf_vectors = tf_idf_vectorizer.fit(X_test["complaint"])
inits = ['random', 'nndsvd', 'nndsvda', 'nndsvdar']
beta_losses = ['frobenius', 'kullback-leibler', 'itakura-saito']
solvers = ['cd', 'mu']
losses = [0, 1]
# inits = ['random', 'nndsvd', 'nndsvda', 'nndsvdar']
# beta_losses = ['frobenius', 'kullback-leibler', 'itakura-saito']
# solvers = ['cd', 'mu']
# losses = [0, 1]
# Fit the NMF model
print("Fitting the NMF model with tf-idf features n_samples=%d and n_features=%d..." % (n_samples, n_features))
# nmf_results = []
# for init, beta_loss, solver, loss in itertools.product(inits, beta_losses, solvers, losses):
# if solver == 'cd' and beta_loss in ['kullback-leibler', 'itakura-saito']:
# continue
# print(f"Fitting the NMF model for {beta_loss}")
# print(f"Parameters: init={init}, beta_loss={beta_loss}, solver={solver}, loss={loss}")
t0 = time()
nmf = NMF(
n_components=n_components,
random_state=1,
init=init,
beta_loss="frobenius",
alpha_W=0.00005,
alpha_H=0.00005,
l1_ratio=1,
).fit(X_train_tfidf_vectors)
# nmf_results.append((init, beta_loss, solver, loss, W))
print("done in %0.3fs." % (time() - t0))
Fitting the NMF model with tf-idf features n_samples=71582 and n_features=1000... done in 9.031s.
W = nmf.transform(X_train_tfidf_vectors)
H = nmf.components_
print(W.shape, H.shape)
(71582, 20) (20, 1000)
nmf.get_feature_names_out()
array(['nmf0', 'nmf1', 'nmf2', 'nmf3', 'nmf4', 'nmf5', 'nmf6', 'nmf7', 'nmf8', 'nmf9', 'nmf10', 'nmf11', 'nmf12', 'nmf13', 'nmf14', 'nmf15', 'nmf16', 'nmf17', 'nmf18', 'nmf19'], dtype=object)
vocab = np.array(tf_idf_vectorizer.get_feature_names_out())
def top_words(t, num_words):
return [vocab[i] for i in np.argsort(t)[: -num_words - 1: -1]]
topic_words = [top_words(t, num_words) for t in H]
topics = [", ".join(t) for t in topic_words]
display(topics[:3])
['repeatedly, credit, credit report, remove, scam, remaining, investigated, remove account, credit score, place, account deleted, negatively, mentioned, nothing, plan, collect debt, national, still, need, limit, year, investigation, informed, tried, including, report, show, reply, public record, contacting', 'would, business day, told, call back, time, aware, right, personal, missed payment, notified, county, receipt, sent, approximately, negatively, email, mistake, charged, year, even though, heard, customer service, similar, almost, best, interest rate, item credit, still, contacted, always', 'debt collection, collect debt, code, collection practice, collection account, debt collector, value, deceptive, agreed, affidavit, order, agreement, collect, creditor, conversation, promptly, outstanding, original, post, validation, always, many, record, fdcpa, collection company, protection bureau, fair debt, note, best, civil']
np.argmax(W)
125406
np.argmax(W, axis=1)
array([14, 10, 1, ..., 16, 11, 3], dtype=int64)
print(
"\n" * 2,
"Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
"features, n_samples=%d and n_features=%d, batch_size=%d..."
% (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
n_components=n_components,
random_state=1,
batch_size=batch_size,
init=init,
beta_loss="frobenius",
alpha_W=0.00005,
alpha_H=0.00005,
l1_ratio=0.5,
).fit(X_train_tfidf_vectors)
print("done in %0.3fs." % (time() - t0))
Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf features, n_samples=71582 and n_features=1000, batch_size=128... done in 6.286s.
Wmb = mbnmf.transform(X_train_tfidf_vectors)
Hmb = mbnmf.components_
print(Wmb.shape, Hmb.shape)
(71582, 20) (20, 1000)
print(
"\n" * 2,
"Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
% (n_samples, n_features),
)
lda = LatentDirichletAllocation(
n_components=n_components,
max_iter=5,
learning_method="online",
learning_offset=50.0,
random_state=0,
)
t0 = time()
lda.fit(X_train_count_vectors.toarray())
print("done in %0.3fs." % (time() - t0))
Fitting LDA models with tf features, n_samples=71582 and n_features=1000... done in 111.890s.
Wlda = lda.transform(X_train_count_vectors.toarray())
Hlda = lda.components_
print(Wlda.shape, Hlda.shape)
(71582, 20) (20, 1000)
count_feature_names = count_vectorizer.get_feature_names_out()
topic_words = [top_words(t, num_words) for t in Hlda]
topics = [", ".join(t) for t in topic_words]
display(topics[:3])
['credit, repeatedly, credit report, initial, branch, credit bureau, remove, account account, reported, info, remaining, experience, company, tried, report credit, file, place, negatively, remove account, still, happen, national, contacting, dispute, credit file, including, time, need, would, three credit', 'legal, sell, receipt, dispute, respond, info, reporting inaccurate, lost, statement, copy credit, representative, document, complaint, day late, 2021, cell, associated, security number, received call, day, 2018, send, promptly, additionally, prove, submitted, still, reasonable, almost, insurance company', 'civil, agreed, reported, debt collection, agreement, collect debt, compliance, promptly, document, free, except, even though, complaint, fact, spent, stated, review, post, credit, account account, complete, keep, regulation, derogatory, much, every, cover, regarding, timely, mark']
lda.perplexity(X_train_count_vectors.toarray())
378.04097379660016