In [ ]:
from IPython.display import display
import itertools
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np
import pandas as pd
from string import punctuation
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from time import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF, PCA, SparsePCA, IncrementalPCA, FastICA, DictionaryLearning, FactorAnalysis, MiniBatchDictionaryLearning, SparseCoder, IncrementalPCA, KernelPCA, MiniBatchNMF, MiniBatchSparsePCA
In [ ]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SunilYadav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SunilYadav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SunilYadav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SunilYadav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Out[ ]:
True
In [ ]:
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
In [ ]:
df = pd.read_csv("./complaints.csv.zip", compression="zip")
<ipython-input-4-d13dcad04102>:2: DtypeWarning: Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv("./complaints.csv.zip", compression="zip")
In [ ]:
df.shape
Out[ ]:
(2712104, 18)
In [ ]:
sampled_df = df.sample(frac=0.1, random_state=1)
In [ ]:
sampled_df.shape
Out[ ]:
(271210, 18)
In [ ]:
df.columns
Out[ ]:
Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')
In [ ]:
sampled_df.head()
Out[ ]:
Date received Product Sub-product Issue Sub-issue Consumer complaint narrative Company public response Company State ZIP code Tags Consumer consent provided? Submitted via Date sent to company Company response to consumer Timely response? Consumer disputed? Complaint ID
1041418 2019-12-27 Credit reporting, credit repair services, or other personal consumer reports Credit reporting Incorrect information on your report Account information incorrect NaN Company has responded to the consumer and the CFPB and chooses not to provide a public response CITIBANK, N.A. TN 38114.0 NaN Consent not provided Web 2019-12-27 Closed with explanation Yes NaN 3478855
2048480 2015-03-19 Mortgage FHA mortgage Credit decision / Underwriting NaN I had homesite home owners ins. It was {$2000.00} a year. I had my mortgage company pay my taxes and insurance through an escrow account. I paid {$1500.00} as a fixed price. My taxes have NOT gone up. My PMI has NOT gone up. I changed my insurance company to liberty mutual. My new yearly price is {$1000.00}. When I notified Bogman Inc. of the change, my new monthly payment only went Down {$10.00}. I have called the company 4 times. First speaking to " XXXX '' whom did not help me, then to her manager " XXXX XXXX ''. Who is telling me they have only been paying {$1000.00} a year for my previous years insurance, which is a lie. Company believes it acted appropriately as authorized by contract or law BOGMAN, INC CT 6410.0 NaN Consent provided Web 2015-03-25 Closed with explanation Yes No 1291276
2320600 2021-10-02 Credit reporting, credit repair services, or other personal consumer reports Credit reporting Incorrect information on your report Information belongs to someone else NaN NaN EQUIFAX, INC. FL 33020.0 NaN Consent not provided Web 2021-10-02 Closed with explanation Yes NaN 4772701
2539072 2022-02-17 Debt collection I do not know Attempts to collect debt not owed Debt is not yours Hi, my name is XXXX XXXX and u would like to dispute a inquiry on my credit that is incorrect and i believe it should be removed contact me at XXXX or by email at XXXX XXXX XXXX You may contact them at XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX , GA XXXX, or visit XXXX XXXX XXXX \n\nKnow your rights A Summary of Your Rights Under the Fair Credit Reporting Act The federal Fair Credit Reporting Act ( FCRA ) promotes the accuracy, fairness, and privacy of information in the files of consumer reporting agencies. There are many types of consumer reporting agencies, including credit bureaus and specialty agencies ( such as agencies that sell information about check writing histories, medical records, and rental history records ). Here is a summary of your major rights under the FCRA. For more information, including information about additional rights, go to www.consumerfinance.gov/learnmore or write to : Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552. Continue Reading En Espaol Para informacin en espaol, visite www.consumerfinance.gov/learnmore o escribe a la Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552.\n\nServices Overview Reports & Scores Personal FinancesBETA Financial Profile Protection Credit Cards Loans Auto Tools Support Disputes Guide FAQs Glossary Education Get the free Experian app : Follow us : About Us Contact Us Terms & Conditions Privacy Policy XXXX Experian. All rights reserved. \nExperian and the Experian trademarks used herein are trademarks or registered trademarks of Experian Information Solutions , Inc., ConsumerInfo.com XXXX IncXXXX or its affiliates. Other product or company names mentioned herein are the property of their respective owners. Licenses and Disclosures. \n\nHard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries Hard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries Company has responded to the consumer and the CFPB and chooses not to provide a public response Experian Information Solutions Inc. NV 89120.0 NaN Consent provided Web 2022-02-17 Closed with explanation Yes NaN 5234095
169976 2022-02-15 Money transfer, virtual currency, or money service Money order Lost or stolen money order NaN NaN NaN Redstone Funding, LLC AL 35673 NaN Consent not provided Web 2022-02-15 Untimely response No NaN 5226465
In [ ]:
sampled_df["Products"] = sampled_df["Product"] + "~~" + sampled_df["Sub-product"]
sampled_df["Issues"] = sampled_df["Issue"] + "~~" + sampled_df["Sub-issue"]
In [ ]:
sampled_df.columns
Out[ ]:
Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID', 'Products',
       'Issues'],
      dtype='object')
In [ ]:
analysis_df = sampled_df[["Products", "Issues", "Consumer complaint narrative", "Company", "ZIP code", "Tags"]]
In [ ]:
analysis_df.head()
Out[ ]:
Products Issues Consumer complaint narrative Company ZIP code Tags
1041418 Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting Incorrect information on your report~~Account information incorrect NaN CITIBANK, N.A. 38114.0 NaN
2048480 Mortgage~~FHA mortgage NaN I had homesite home owners ins. It was {$2000.00} a year. I had my mortgage company pay my taxes and insurance through an escrow account. I paid {$1500.00} as a fixed price. My taxes have NOT gone up. My PMI has NOT gone up. I changed my insurance company to liberty mutual. My new yearly price is {$1000.00}. When I notified Bogman Inc. of the change, my new monthly payment only went Down {$10.00}. I have called the company 4 times. First speaking to " XXXX '' whom did not help me, then to her manager " XXXX XXXX ''. Who is telling me they have only been paying {$1000.00} a year for my previous years insurance, which is a lie. BOGMAN, INC 6410.0 NaN
2320600 Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting Incorrect information on your report~~Information belongs to someone else NaN EQUIFAX, INC. 33020.0 NaN
2539072 Debt collection~~I do not know Attempts to collect debt not owed~~Debt is not yours Hi, my name is XXXX XXXX and u would like to dispute a inquiry on my credit that is incorrect and i believe it should be removed contact me at XXXX or by email at XXXX XXXX XXXX You may contact them at XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX , GA XXXX, or visit XXXX XXXX XXXX \n\nKnow your rights A Summary of Your Rights Under the Fair Credit Reporting Act The federal Fair Credit Reporting Act ( FCRA ) promotes the accuracy, fairness, and privacy of information in the files of consumer reporting agencies. There are many types of consumer reporting agencies, including credit bureaus and specialty agencies ( such as agencies that sell information about check writing histories, medical records, and rental history records ). Here is a summary of your major rights under the FCRA. For more information, including information about additional rights, go to www.consumerfinance.gov/learnmore or write to : Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552. Continue Reading En Espaol Para informacin en espaol, visite www.consumerfinance.gov/learnmore o escribe a la Consumer Financial Protection Bureau, 1700 G Street N.W., Washington, D.C. 20552.\n\nServices Overview Reports & Scores Personal FinancesBETA Financial Profile Protection Credit Cards Loans Auto Tools Support Disputes Guide FAQs Glossary Education Get the free Experian app : Follow us : About Us Contact Us Terms & Conditions Privacy Policy XXXX Experian. All rights reserved. \nExperian and the Experian trademarks used herein are trademarks or registered trademarks of Experian Information Solutions , Inc., ConsumerInfo.com XXXX IncXXXX or its affiliates. Other product or company names mentioned herein are the property of their respective owners. Licenses and Disclosures. \n\nHard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries Hard inquiries XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX XXXX XXXX Inquired on XX/XX/XXXX On record until XX/XX/XXXX Soft inquiries Experian Information Solutions Inc. 89120.0 NaN
169976 Money transfer, virtual currency, or money service~~Money order NaN NaN Redstone Funding, LLC 35673 NaN
In [ ]:
analysis_df.rename(columns={"Consumer complaint narrative": "complaint", "ZIP code": "Zip", "Products": "Product", "Issues": "Issue"}, inplace=True)  # type: ignore
<ipython-input-14-8b7d2ffb9d8f>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis_df.rename(columns={"Consumer complaint narrative": "complaint", "ZIP code": "Zip", "Products": "Product", "Issues": "Issue"}, inplace=True)  # type: ignore
In [ ]:
analysis_df.columns
Out[ ]:
Index(['Product', 'Issue', 'complaint', 'Company', 'Zip', 'Tags'], dtype='object')
In [ ]:
analysis_df.value_counts("Product")  # type: ignore
Out[ ]:
Product
Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting    112119
Credit card or prepaid card~~General-purpose credit card or charge card                            11093
Checking or savings account~~Checking account                                                       9754
Debt collection~~I do not know                                                                      9445
Mortgage~~Other mortgage                                                                            8603
                                                                                                   ...  
Other financial service~~Foreign currency exchange                                                     6
Vehicle loan or lease~~Title loan                                                                      4
Payday loan, title loan, or personal loan~~Pawn loan                                                   3
Prepaid card~~Transit card                                                                             2
Virtual currency~~Domestic (US) money transfer                                                         2
Length: 92, dtype: int64
In [ ]:
analysis_df.value_counts("Company")  # type: ignore
Out[ ]:
Company
EQUIFAX, INC.                             43687
TRANSUNION INTERMEDIATE HOLDINGS, INC.    34533
Experian Information Solutions Inc.       32268
BANK OF AMERICA, NATIONAL ASSOCIATION     11351
WELLS FARGO & COMPANY                      9864
                                          ...  
ROCKY MOUNTAIN MORTGAGE COMPANY               1
Franklin Financial  Corporation               1
Franks Adjustment Bureau, Inc.                1
RNN Group, Inc                                1
snw investments                               1
Length: 3634, dtype: int64
In [ ]:
analysis_df.dropna(subset=["complaint"], inplace=True)  # type: ignore
<ipython-input-18-d65b03af9762>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis_df.dropna(subset=["complaint"], inplace=True)  # type: ignore
In [ ]:
analysis_df.shape
Out[ ]:
(95341, 6)
In [ ]:
punctuation = list(punctuation)
stop_words = stopwords.words('english')
porter = PorterStemmer()
lancaster = LancasterStemmer()
englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
wordnet_lemmatizer = WordNetLemmatizer()
In [ ]:
def data_cleaner(complaint_text: str):
    complaint_text = complaint_text.lower()
    complaint_text = complaint_text.replace("\n", " ")
    complaint_text = complaint_text.replace("\r", " ")
    complaint_text = complaint_text.replace("\t", " ")
    complaint_text = complaint_text.replace(".", "")
    complaint_text = complaint_text.replace("\xa0", " ")
    complaint_text = complaint_text.replace("xx/xx/xxxx", " ")
    complaint_text = complaint_text.replace("xxxx", " ")
    complaint_text = complaint_text.replace("xx", " ")
    tokens = word_tokenize(complaint_text)
    complaint_text = " ".join([token for token in tokens if token not in stop_words and token not in punctuation and not token.isdigit() and len(token) > 3])
    re.sub(r"\d+", "", complaint_text)
    # re.sub(r"\d+", "", complaint_text)
    # re.sub(r"\{\$\d+\.\d+\}", "", complaint_text)
    re.sub(r"[^A-Za-z]+", "", complaint_text)
    complaint_text = " ".join([wordnet_lemmatizer.lemmatize(word) for word in complaint_text.split()])
    return complaint_text
In [ ]:
'''
    target_text = data_cleaner(analysis_df["complaint"].iloc[1])
    t0 = time()
    for _ in range(10000):
        " ".join([wordnet_lemmatizer.lemmatize(word) for word in    rget_text.split()])
    print("Lemmatization took %.2f seconds" % (time() - t0))
    t0 = time()
    for _ in range(10000):
        " ".join([englishStemmer.stem(word) for word in target_text.split()
    print("English Stemming took %.2f seconds" % (time() - t0))
    t0 = time()
    for _ in range(10000):
        " ".join([porter.stem(word) for word in target_text.split()])
    print("Porter Stemming took %.2f seconds" % (time() - t0))
    t0 = time()
    for _ in range(10000):
        " ".join([lancaster.stem(word) for word in target_text.split()])
    print("Lancaster Stemming took %.2f seconds" % (time() - t0))
'''
Out[ ]:
'\n    target_text = data_cleaner(analysis_df["complaint"].iloc[1])\n    t0 = time()\n    for _ in range(10000):\n        " ".join([wordnet_lemmatizer.lemmatize(word) for word in    rget_text.split()])\n    print("Lemmatization took %.2f seconds" % (time() - t0))\n    t0 = time()\n    for _ in range(10000):\n        " ".join([englishStemmer.stem(word) for word in target_text.split()\n    print("English Stemming took %.2f seconds" % (time() - t0))\n    t0 = time()\n    for _ in range(10000):\n        " ".join([porter.stem(word) for word in target_text.split()])\n    print("Porter Stemming took %.2f seconds" % (time() - t0))\n    t0 = time()\n    for _ in range(10000):\n        " ".join([lancaster.stem(word) for word in target_text.split()])\n    print("Lancaster Stemming took %.2f seconds" % (time() - t0))\n'
In [ ]:
analysis_df["complaint"] = analysis_df["complaint"].apply(data_cleaner)
<ipython-input-23-bfc8721173c5>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis_df["complaint"] = analysis_df["complaint"].apply(data_cleaner)
In [ ]:
analysis_df.head()
Out[ ]:
Product Issue complaint Company Zip Tags
2048480 Mortgage~~FHA mortgage NaN homesite home owner year mortgage company tax insurance escrow account paid fixed price tax gone gone changed insurance company liberty mutual yearly price notified bogman change monthly payment went called company time first speaking help manager telling paying year previous year insurance BOGMAN, INC 6410.0 NaN
2539072 Debt collection~~I do not know Attempts to collect debt not owed~~Debt is not yours name would like dispute inquiry credit incorrect believe removed contact email contact visit know right summary right fair credit reporting federal fair credit reporting fcra promotes accuracy fairness privacy information file consumer reporting agency many type consumer reporting agency including credit bureau specialty agency agency sell information check writing history medical record rental history record summary major right fcra information including information additional right wwwconsumerfinancegov/learnmore write consumer financial protection bureau street washington continue reading espaol para informacin espaol visite wwwconsumerfinancegov/learnmore escribe consumer financial protection bureau street washington service overview report score personal financesbeta financial profile protection credit card loan auto tool support dispute guide faq glossary education free experian follow contact term condition privacy policy experian right reserved experian experian trademark used herein trademark registered trademark experian information solution consumerinfocom affiliate product company name mentioned herein property respective owner license disclosure hard inquiry inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record soft inquiry hard inquiry inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record inquired record soft inquiry Experian Information Solutions Inc. 89120.0 NaN
1538365 Credit card or prepaid card~~General-purpose credit card or charge card Other features, terms, or problems~~Problem with rewards from credit card approved amex gold delta skymiles gold amex platinum delta skymiles platinum card gold card point bonus spending platinum card point bonus spending within first month term condition stated bonus would awarded within week hitting spend threshold bonus threshold gold card platinum card called american express couple week bonus threshold told bonus post automatically week date threshold call back point would manually applied account immediately week later bonus posted account called american express spoke manager named operator told awarded bonus total point manually posting account would point within 24-72 hour point never posted called american express back told point never posted even though told would post within 24-72 hour manager spoke said nothing could would wait point would post requested coordinate call back point post even though told would within 24-72 hour said would reach would call back within business day never called called american express spoke supervisor requesting point manually awarded supervisor acknowledged awarded point said nothing could supervisor also told need patient wait point post nothing additional could asked never called back said know would another request call assured time would call within business day also putting separate request someone higher american express would calling within day well discus issue offer resolution today still received bonus point either card heard absolutely nothing anyone else american express three month since spend threshold month since told bonus point would manually added account regular representative supervisor american express refusing assist offer explanation essentially told jump lake even though acknowledged received bonus american express action dishonest unethical expressly breached term card agreement entered applied card spent hour phone american express trying resolve calm peaceful manner avail respectfully request cfpb reach american express behalf request correct injustice award bonus point deserve thank AMERICAN EXPRESS COMPANY 11367.0 NaN
2056487 Mortgage~~Conventional fixed mortgage NaN bank america denying loan modification year BANK OF AMERICA, NATIONAL ASSOCIATION 95304.0 Older American
2423207 Credit reporting, credit repair services, or other personal consumer reports~~Credit reporting Incorrect information on your report~~Old information reappears or never goes away account violating fcra right deleting account year still reporting equifax beyond statute limitation reporting EQUIFAX, INC. 46804 NaN
In [ ]:
word_counts = analysis_df["complaint"].str.split().apply(lambda x: len(x))
In [ ]:
pd.Series(analysis_df["complaint"].str.split().apply(lambda x: len(x))).describe()
Out[ ]:
count    95341.000000
mean        77.247082
std         94.253957
min          0.000000
25%         26.000000
50%         50.000000
75%         93.000000
max       2574.000000
Name: complaint, dtype: float64
In [ ]:
analysis_df[analysis_df["complaint"].str.split().apply(lambda x: len(x)) < 10]["complaint"]
Out[ ]:
2056487                                   bank america denying loan modification year
2622762                         account closed debt paid full account show still owed
2697336                                         capital inquires date remember others
2315623                 unauthorized inquiry credit report must removed soon possible
2014006                                   money date receiving medicaid medicaid paid
                                              ...                                    
2163112                                                                              
427550                      creditor contacted knowledge debt contacted attorney debt
1983098    specialized loan service repeatedly denies application mortgage assistance
1070682                                               inquiry credit report recognize
1882059                       information transferred another company without consent
Name: complaint, Length: 4903, dtype: object
In [ ]:
analysis_df = analysis_df[analysis_df["complaint"].str.split().apply(lambda x: len(x)) > 10]
In [ ]:
pd.Series(analysis_df["complaint"].str.split().apply(lambda x: len(x))).describe()
Out[ ]:
count    89478.000000
mean        81.857138
std         95.498488
min         11.000000
25%         30.000000
50%         54.000000
75%         98.000000
max       2574.000000
Name: complaint, dtype: float64
In [ ]:
n_features = 1000
n_components = 20
n_top_words = 20
batch_size = 128
init = "nndsvda"
random_state = 42
num_words = 30
In [ ]:
count_vectorizer = CountVectorizer(stop_words=stop_words, max_df=0.95, min_df=0.01, max_features=n_features, ngram_range=(1, 2))
tf_idf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.95, min_df=0.01, max_features=n_features, ngram_range=(1, 2))
In [ ]:
X_train, X_test = train_test_split(analysis_df, test_size=0.2, random_state=1)
In [ ]:
n_samples = X_train.shape[0]
In [ ]:
X_train_count_vectors = count_vectorizer.fit_transform(X_train["complaint"])
X_test_count_vectors = count_vectorizer.fit(X_test["complaint"])
In [ ]:
X_train_tfidf_vectors = tf_idf_vectorizer.fit_transform(X_train["complaint"])
X_test_tfidf_vectors = tf_idf_vectorizer.fit(X_test["complaint"])
In [ ]:
inits = ['random', 'nndsvd', 'nndsvda', 'nndsvdar']
beta_losses = ['frobenius', 'kullback-leibler', 'itakura-saito']
solvers = ['cd', 'mu']
losses = [0, 1]
# inits = ['random', 'nndsvd', 'nndsvda', 'nndsvdar']
# beta_losses = ['frobenius', 'kullback-leibler', 'itakura-saito']
# solvers = ['cd', 'mu']
# losses = [0, 1]
In [ ]:
# Fit the NMF model
print("Fitting the NMF model with tf-idf features n_samples=%d and n_features=%d..." % (n_samples, n_features))
# nmf_results = []
# for init, beta_loss, solver, loss in itertools.product(inits, beta_losses, solvers, losses):
#     if solver == 'cd' and beta_loss in ['kullback-leibler', 'itakura-saito']:
#         continue
#     print(f"Fitting the NMF model for {beta_loss}")
#     print(f"Parameters: init={init}, beta_loss={beta_loss}, solver={solver}, loss={loss}")
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(X_train_tfidf_vectors)
# nmf_results.append((init, beta_loss, solver, loss, W))
print("done in %0.3fs." % (time() - t0))
Fitting the NMF model with tf-idf features n_samples=71582 and n_features=1000...
done in 9.031s.
In [ ]:
W = nmf.transform(X_train_tfidf_vectors)
H = nmf.components_
print(W.shape, H.shape)
(71582, 20) (20, 1000)
In [ ]:
nmf.get_feature_names_out()
Out[ ]:
array(['nmf0', 'nmf1', 'nmf2', 'nmf3', 'nmf4', 'nmf5', 'nmf6', 'nmf7',
       'nmf8', 'nmf9', 'nmf10', 'nmf11', 'nmf12', 'nmf13', 'nmf14',
       'nmf15', 'nmf16', 'nmf17', 'nmf18', 'nmf19'], dtype=object)
In [ ]:
vocab = np.array(tf_idf_vectorizer.get_feature_names_out())
In [ ]:
def top_words(t, num_words):
    return [vocab[i] for i in np.argsort(t)[: -num_words - 1: -1]]
In [ ]:
topic_words = [top_words(t, num_words) for t in H]
topics = [", ".join(t) for t in topic_words]
display(topics[:3])
['repeatedly, credit, credit report, remove, scam, remaining, investigated, remove account, credit score, place, account deleted, negatively, mentioned, nothing, plan, collect debt, national, still, need, limit, year, investigation, informed, tried, including, report, show, reply, public record, contacting',
 'would, business day, told, call back, time, aware, right, personal, missed payment, notified, county, receipt, sent, approximately, negatively, email, mistake, charged, year, even though, heard, customer service, similar, almost, best, interest rate, item credit, still, contacted, always',
 'debt collection, collect debt, code, collection practice, collection account, debt collector, value, deceptive, agreed, affidavit, order, agreement, collect, creditor, conversation, promptly, outstanding, original, post, validation, always, many, record, fdcpa, collection company, protection bureau, fair debt, note, best, civil']
In [ ]:
np.argmax(W)
Out[ ]:
125406
In [ ]:
np.argmax(W, axis=1)
Out[ ]:
array([14, 10,  1, ..., 16, 11,  3], dtype=int64)
In [ ]:
print(
    "\n" * 2,
    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
    "features, n_samples=%d and n_features=%d, batch_size=%d..."
    % (n_samples, n_features, batch_size),
)
t0 = time()
mbnmf = MiniBatchNMF(
    n_components=n_components,
    random_state=1,
    batch_size=batch_size,
    init=init,
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(X_train_tfidf_vectors)
print("done in %0.3fs." % (time() - t0))

 Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf features, n_samples=71582 and n_features=1000, batch_size=128...
done in 6.286s.
In [ ]:
Wmb = mbnmf.transform(X_train_tfidf_vectors)
Hmb = mbnmf.components_
In [ ]:
print(Wmb.shape, Hmb.shape)
(71582, 20) (20, 1000)
In [ ]:
print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(X_train_count_vectors.toarray())
print("done in %0.3fs." % (time() - t0))

 Fitting LDA models with tf features, n_samples=71582 and n_features=1000...
done in 111.890s.
In [ ]:
Wlda = lda.transform(X_train_count_vectors.toarray())
Hlda = lda.components_
print(Wlda.shape, Hlda.shape)
(71582, 20) (20, 1000)
In [ ]:
count_feature_names = count_vectorizer.get_feature_names_out()

topic_words = [top_words(t, num_words) for t in Hlda]
topics = [", ".join(t) for t in topic_words]
display(topics[:3])
['credit, repeatedly, credit report, initial, branch, credit bureau, remove, account account, reported, info, remaining, experience, company, tried, report credit, file, place, negatively, remove account, still, happen, national, contacting, dispute, credit file, including, time, need, would, three credit',
 'legal, sell, receipt, dispute, respond, info, reporting inaccurate, lost, statement, copy credit, representative, document, complaint, day late, 2021, cell, associated, security number, received call, day, 2018, send, promptly, additionally, prove, submitted, still, reasonable, almost, insurance company',
 'civil, agreed, reported, debt collection, agreement, collect debt, compliance, promptly, document, free, except, even though, complaint, fact, spent, stated, review, post, credit, account account, complete, keep, regulation, derogatory, much, every, cover, regarding, timely, mark']
In [ ]:
lda.perplexity(X_train_count_vectors.toarray())
Out[ ]:
378.04097379660016