import pandas as pd
import numpy as np
df = pd.read_excel('survey_data2.xlsx')
df=df[['Age','Gender','State','Children','Salary','Agree or Not']]
df = pd.concat([df,pd.get_dummies(df['Age'], prefix='_', drop_first=True)],axis=1)
df.drop(['Age'],axis=1, inplace=True)
df = pd.concat([df,pd.get_dummies(df['State'], prefix='State', drop_first=True)],axis=1)
df.drop(['State'],axis=1, inplace=True)
df['Gender']=df['Gender']-1 #shift the numeric gender codes down by one so they start at 0
df_copy = df.copy()
train_set = df_copy.sample(frac=0.80, random_state=0)
test_set = df_copy.drop(train_set.index)
test_set_labels = test_set.pop('Agree or Not')
train_set_labels = train_set.pop('Agree or Not')
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_set)
mlp = MLPClassifier(hidden_layer_sizes=(17,17,17,17),max_iter=1000) #four hidden layers with 17 nodes each
mlp.fit(scaled_features,train_set_labels)
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(17, 17, 17, 17), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=None,
shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
verbose=False, warm_start=False)
predictions = mlp.predict(train_set) #note: the model was fit on scaled features, so passing the unscaled train_set here degrades the predictions below; the test set would likewise need scaling (see the corrected sketch after the classification report)
ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(train_set_labels, predictions))
[[184   0]
 [135   0]]
print(classification_report(train_set_labels, predictions))
              precision    recall  f1-score   support

           0       0.58      1.00      0.73       184
           1       0.00      0.00      0.00       135

    accuracy                           0.58       319
   macro avg       0.29      0.50      0.37       319
weighted avg       0.33      0.58      0.42       319
UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
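#A minimal sketch of the corrected evaluation: the network was fit on scaled features, so both sets
#should be transformed with the scaler that was already fit on the training data (transform, not fit_transform).
#Variable names reuse the ones defined above; actual numbers will differ from the run shown here.
scaled_test = scaler.transform(test_set)
test_predictions = mlp.predict(scaled_test)
print(confusion_matrix(test_set_labels, test_predictions))
print(classification_report(test_set_labels, test_predictions, zero_division=0))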
import re
text="The rain in Spain is mainly on the plain."
x=re.findall("ai",text)
x
['ai', 'ai', 'ai', 'ai']
x=re.split("\s", text)
x
['The', 'rain', 'in', 'Spain', 'is', 'mainly', 'on', 'the', 'plain.']
x=re.split("\s", text, 1)
x
['The', 'rain in Spain is mainly on the plain.']
x=re.sub(r"\s", "-", text)
x
'The-rain-in-Spain-is-mainly-on-the-plain.'
x=re.search(r"\bS\w+", text)
x.span()
(12, 17)
print(x.string)
The rain in Spain is mainly on the plain.
print(x.group())
Spain
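#re.search returns None when nothing matches, so check the result before calling .group() or .span().
#Hypothetical pattern for illustration: no word in the sentence starts with "Z".
y=re.search(r"\bZ\w+", text)
print(y) #None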
x=re.findall('[mat]', text)
x
['a', 'a', 'm', 'a', 't', 'a']
x=re.findall('ain+', text)
x
['ain', 'ain', 'ain', 'ain']
#https://www.w3schools.com/python/python_regex.asp more code keys here
crimedf = pd.read_excel('crime_data.xlsx')
transactdf=pd.read_excel('transaction_data.xlsx')
crimedf.head()
 | Year | Population | Violent crime total | Murder and nonnegligent manslaughter | Forcible rape | Robbery | Aggravated assault | Property crime total | Burglary | Larceny-theft | Motor vehicle theft |
---|---|---|---|---|---|---|---|---|---|---|---
0 | 1960 | 179323175 | 288460 | 9110 | 17190 | 107840 | 154320 | 3095700 | 912100 | 1855400 | 328200 |
1 | 1961 | 182992000 | 289390 | 8740 | 17220 | 106670 | 156760 | 3198600 | 949600 | 1913000 | 336000 |
2 | 1962 | 185771000 | 301510 | 8530 | 17550 | 110860 | 164570 | 3450700 | 994300 | 2089600 | 366800 |
3 | 1963 | 188483000 | 316970 | 8640 | 17650 | 116470 | 174210 | 3792500 | 1086400 | 2297800 | 408300 |
4 | 1964 | 191141000 | 364220 | 9360 | 21420 | 130390 | 203050 | 4200400 | 1213200 | 2514400 | 472800 |
transactdf.head()
 | Transaction | Purchase Date | Customer ID | Gender | Marital Status | Homeowner | Children | Annual Income | City | State or Province | Country | Product Family | Product Department | Product Category | Units Sold | Revenue |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 2014-12-18 | 7223 | F | S | Y | 2 | $30K - $50K | Los Angeles | CA | USA | Food | Snack Foods | Snack Foods | 5 | 27.38 |
1 | 2 | 2014-12-20 | 7841 | M | M | Y | 5 | $70K - $90K | Los Angeles | CA | USA | Food | Produce | Vegetables | 5 | 14.90 |
2 | 3 | 2014-12-21 | 8374 | F | M | N | 2 | $50K - $70K | Bremerton | WA | USA | Food | Snack Foods | Snack Foods | 3 | 5.52 |
3 | 4 | 2014-12-21 | 9619 | M | M | Y | 3 | $30K - $50K | Portland | OR | USA | Food | Snacks | Candy | 4 | 4.44 |
4 | 5 | 2014-12-22 | 1900 | F | S | Y | 3 | $130K - $150K | Beverly Hills | CA | USA | Drink | Beverages | Carbonated Beverages | 4 | 14.00 |
crimedf.dtypes
Year                                     int64
Population                               int64
Violent crime total                      int64
Murder and nonnegligent manslaughter     int64
Forcible rape                            int64
Robbery                                  int64
Aggravated assault                       int64
Property crime total                     int64
Burglary                                 int64
Larceny-theft                            int64
Motor vehicle theft                      int64
dtype: object
transactdf.dtypes
Transaction                    int64
Purchase Date         datetime64[ns]
Customer ID                    int64
Gender                        object
Marital Status                object
Homeowner                     object
Children                       int64
Annual Income                 object
City                          object
State or Province             object
Country                       object
Product Family                object
Product Department            object
Product Category              object
Units Sold                     int64
Revenue                      float64
dtype: object
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.api import acf, pacf, graphics
fig,ax = plt.subplots()
ax = crimedf.plot(ax=ax)
sns.set_style('darkgrid')
pd.plotting.register_matplotlib_converters()
sns.mpl.rc('figure', figsize=(16,6))
fig,ax = plt.subplots()
ax = crimedf.plot(ax=ax)
crime_rate=crimedf.copy()
crime_rate['Violent crime rate'] = crimedf['Violent crime total']/crimedf['Population']
crime_rate['Year']=crimedf['Year']
fig, ax = plt.subplots()
ax = crime_rate['Violent crime rate'].plot(ax=ax)
fig,ax = plt.subplots()
ax = transactdf.plot(ax=ax)
fig,ax = plt.subplots()
ax = transactdf['Revenue'].plot(ax=ax)
transact_small=transactdf.copy()
transact_small.drop(['Transaction'],axis=1, inplace=True)
transact_small.drop(['Customer ID'],axis=1, inplace=True)
import matplotlib.pylab as plt
pd.plotting.lag_plot(transact_small['Revenue'])
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>
pd.plotting.lag_plot(crime_rate['Violent crime rate'])
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>
pd.plotting.autocorrelation_plot(crime_rate['Violent crime rate'])
<AxesSubplot:xlabel='Lag', ylabel='Autocorrelation'>
transact_small['Revenue'].corr(transact_small['Revenue'].shift(50))
-0.004421155977020592
crime_rate['Violent crime rate'].corr(crime_rate['Violent crime rate'].shift(30))
-0.9654049596830483
from statsmodels.tsa.ar_model import AutoReg
#for time series data, use the first 80% of the observations as the training set and the last 20% as the test set; don't randomize the split
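#A minimal sketch of that chronological split, using the crime rate series defined above:
series = crime_rate['Violent crime rate']
split_point = int(len(series) * 0.80)
train_series = series.iloc[:split_point]  #first 80% of the years, in time order
test_series = series.iloc[split_point:]   #last 20% of the years, held out for evaluation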
model = AutoReg(crime_rate['Violent crime rate'],1, old_names=False)
model_fitted = model.fit()
model_fitted.params
const                    0.000291
Violent crime rate.L1    0.949613
dtype: float64
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(crime_rate['Violent crime rate'], lags=20)
plt.xlabel('Lags', fontsize=12)
plt.ylabel('Partial Autocorrelation', fontsize=12)
plt.show()
#based on the PACF plot, include at least lags 1 and 2 in the model
from statsmodels.tsa.stattools import adfuller
result = adfuller(crime_rate['Violent crime rate'])
print('p-value: %.2f' % result[1])
p-value: 0.24
crime_rate['Difference'] = crime_rate['Violent crime rate'].diff()
result = adfuller(crime_rate['Difference'].dropna())
print('p-value: %.2f' % result[1])
p-value: 0.03
model = AutoReg(crime_rate['Difference'].dropna(),2, old_names=False)
model_fitted = model.fit()
ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
model_fitted.params
const            0.000014
Difference.L1    0.669151
Difference.L2   -0.030646
dtype: float64
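#A possible follow-up (not run above, so treat it as a sketch): forecast the differenced series a few
#steps ahead with the fitted model, then undo the differencing by adding the cumulative forecasts back
#onto the last observed rate. Integer positions are used because the warning above notes the index is ignored.
diff_series = crime_rate['Difference'].dropna()
forecast_diff = model_fitted.predict(start=len(diff_series), end=len(diff_series) + 4)
forecast_rate = crime_rate['Violent crime rate'].iloc[-1] + forecast_diff.cumsum()
print(forecast_rate)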