File: Hotel-Recommendations.ipynb
Name: Corinne Medeiros
Date: 10/18/20
Usage: Previews and summarizes Expedia Hotel Recommendations data, generates exploratory visualizations, and applies predictive models to classify hotel clusters based on user data.
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import yellowbrick
from yellowbrick.features import Rank2D # correlation visualization package
from yellowbrick.style import set_palette # color for yellowbrick visualizer
from scipy.stats import spearmanr
from scipy.stats import kendalltau
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
Data source:
Expedia Hotel Recommendations
https://www.kaggle.com/c/expedia-hotel-recommendations

To understand the data, I reviewed the columns and descriptions provided by the Kaggle data overview tab:
https://www.kaggle.com/c/expedia-hotel-recommendations/data?select=train.csv
The dataset is very large, with over 37 million observations, so I will only load a smaller subset.
Besides the target variable of hotel_cluster, the columns I'm going to explore are user_id, is_package, site_name, user_location_country, hotel_continent, srch_adults_cnt, srch_children_cnt, and srch_destination_id.
# Loading a subset of the data into a pandas dataframe, choosing columns and specifying data types
hotels_train_df = pd.read_csv('train.csv',
                              usecols=['user_id', 'is_package', 'site_name', 'user_location_country',
                                       'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id',
                                       'hotel_cluster', 'hotel_continent'],
                              dtype={'is_package': bool},  # reading this column as boolean
                              nrows=500000)
# Previewing data
hotels_train_df.head(10)
| | site_name | user_location_country | user_id | is_package | srch_adults_cnt | srch_children_cnt | srch_destination_id | hotel_continent | hotel_cluster |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 66 | 12 | True | 2 | 0 | 8250 | 2 | 1 |
| 1 | 2 | 66 | 12 | True | 2 | 0 | 8250 | 2 | 1 |
| 2 | 2 | 66 | 12 | False | 2 | 0 | 8250 | 2 | 1 |
| 3 | 2 | 66 | 93 | False | 2 | 0 | 14984 | 2 | 80 |
| 4 | 2 | 66 | 93 | False | 2 | 0 | 14984 | 2 | 21 |
| 5 | 2 | 66 | 93 | False | 2 | 0 | 14984 | 2 | 92 |
| 6 | 2 | 66 | 501 | False | 2 | 0 | 8267 | 2 | 41 |
| 7 | 2 | 66 | 501 | True | 2 | 0 | 8267 | 2 | 41 |
| 8 | 2 | 66 | 501 | False | 2 | 0 | 8267 | 2 | 69 |
| 9 | 2 | 66 | 501 | False | 2 | 0 | 8267 | 2 | 70 |
# Summary of data
hotels_train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 9 columns):
site_name                500000 non-null int64
user_location_country    500000 non-null int64
user_id                  500000 non-null int64
is_package               500000 non-null bool
srch_adults_cnt          500000 non-null int64
srch_children_cnt        500000 non-null int64
srch_destination_id      500000 non-null int64
hotel_continent          500000 non-null int64
hotel_cluster            500000 non-null int64
dtypes: bool(1), int64(8)
memory usage: 31.0 MB
# Summary information for columns
hotels_train_df.describe()
| | site_name | user_location_country | user_id | srch_adults_cnt | srch_children_cnt | srch_destination_id | hotel_continent | hotel_cluster |
|---|---|---|---|---|---|---|---|---|
| count | 500000.000000 | 500000.000000 | 5.000000e+05 | 500000.000000 | 500000.000000 | 500000.000000 | 500000.000000 | 500000.000000 |
| mean | 9.596348 | 84.890906 | 5.192133e+05 | 2.029878 | 0.328476 | 14520.732588 | 3.211660 | 49.821640 |
| std | 12.209567 | 55.099574 | 3.399832e+05 | 0.918030 | 0.729282 | 11074.285557 | 1.657219 | 28.956834 |
| min | 2.000000 | 0.000000 | 1.200000e+01 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 2.000000 | 66.000000 | 2.156960e+05 | 2.000000 | 0.000000 | 8267.000000 | 2.000000 | 25.000000 |
| 50% | 2.000000 | 66.000000 | 4.443650e+05 | 2.000000 | 0.000000 | 9927.000000 | 2.000000 | 49.000000 |
| 75% | 13.000000 | 69.000000 | 8.135170e+05 | 2.000000 | 0.000000 | 18965.000000 | 4.000000 | 73.000000 |
| max | 53.000000 | 239.000000 | 1.198784e+06 | 9.000000 | 9.000000 | 65035.000000 | 6.000000 | 99.000000 |
# Summary information for columns without scientific notation
with pd.option_context('float_format', '{:f}'.format):
    print(hotels_train_df.describe())
site_name user_location_country user_id srch_adults_cnt \
count 500000.000000 500000.000000 500000.000000 500000.000000
mean 9.596348 84.890906 519213.302920 2.029878
std 12.209567 55.099574 339983.175557 0.918030
min 2.000000 0.000000 12.000000 0.000000
25% 2.000000 66.000000 215696.000000 2.000000
50% 2.000000 66.000000 444365.000000 2.000000
75% 13.000000 69.000000 813517.000000 2.000000
max 53.000000 239.000000 1198784.000000 9.000000
srch_children_cnt srch_destination_id hotel_continent hotel_cluster
count 500000.000000 500000.000000 500000.000000 500000.000000
mean 0.328476 14520.732588 3.211660 49.821640
std 0.729282 11074.285557 1.657219 28.956834
min 0.000000 1.000000 0.000000 0.000000
25% 0.000000 8267.000000 2.000000 25.000000
50% 0.000000 9927.000000 2.000000 49.000000
75% 0.000000 18965.000000 4.000000 73.000000
max 9.000000 65035.000000 6.000000 99.000000
# Displaying summary information for boolean 'is_package' column
print(hotels_train_df.describe(include=[bool]))
       is_package
count      500000
unique          2
top         False
freq       372236
# Checking missing data sums
hotels_train_df.isna().sum()
site_name                0
user_location_country    0
user_id                  0
is_package               0
srch_adults_cnt          0
srch_children_cnt        0
srch_destination_id      0
hotel_continent          0
hotel_cluster            0
dtype: int64
# User country frequency using seaborn countplot
plt.figure(figsize=(15, 9))
plt.xticks(rotation=90)
sns.countplot(x='user_location_country', data=hotels_train_df)
Even though the x-axis of this graph is too crowded to read, the bar graph, together with the quartile summary statistics, tells us that the vast majority of users in this subset represent country 66. I will confirm this with further plotting next.
This may be a bias introduced by selecting only a subset, so for future exploration I could select a different subset, or load all of the data in chunks, to see whether the full data represent a more diverse sample. For the purposes of this assignment and learning, I'm going to stick with this smaller subset; a chunked scan is sketched below for reference.
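A minimal sketch of that chunked approach, assuming the same train.csv file (it scans all ~37 million rows, so it is slow and not run here):
# Sketch: counting user countries across the full file in chunks
# instead of loading all rows into memory at once
country_counts = pd.Series(dtype='int64')
for chunk in pd.read_csv('train.csv', usecols=['user_location_country'], chunksize=1000000):
    country_counts = country_counts.add(chunk['user_location_country'].value_counts(), fill_value=0)
print(country_counts.sort_values(ascending=False).head(10))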
# Bar graph of the number of users in each country for the top ten countries
# Counting observations per user country
country_count = hotels_train_df['user_location_country'].value_counts()
# Limiting to the top 10 countries
countries_topten = country_count[:10]
plt.figure(figsize=(12, 9))
sns.barplot(x=countries_topten.index, y=countries_topten.values, alpha=0.8)
plt.title('Top 10 Countries of Users')
plt.ylabel('Number of Observations', fontsize=12)
plt.xlabel('Country', fontsize=12)
plt.show()
After limiting the data to the ten most frequent country values, we can clearly confirm that our users mostly come from country 66; the quick check below puts an exact number on that.
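To quantify "vast majority," the share of observations from country 66 can be computed directly from the same dataframe:
# Proportion of observations whose user is located in country 66
share_66 = (hotels_train_df['user_location_country'] == 66).mean()
print("Share of users from country 66: {:.1%}".format(share_66))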
# Boxplot of hotel cluster by hotel continent
plt.figure(figsize=(12,9))
sns.boxplot(x=hotels_train_df["hotel_continent"], y=hotels_train_df["hotel_cluster"], palette="Blues")
plt.show()
This box plot is difficult to interpret because it doesn't represent the data well: hotel_cluster is a discrete categorical variable, and the plot treats it as continuous. We can see that continent 0 spans a wider range of hotel clusters while continent 1 spans a narrower one, but without more information about the clusters themselves this insight isn't very useful. I'm going to look at the frequency of hotel clusters instead.
# Plot frequency of each hotel cluster
hotels_train_df["hotel_cluster"].value_counts().plot(kind='bar',colormap="Set3",figsize=(15,7))
plt.xlabel('Hotel Cluster', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Frequency of Hotel Clusters', fontsize=20)
plt.show()
From this bar chart we can see that hotel clusters 91 and 41 are the most frequent groups, and the least common group is cluster 74.
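The exact counts behind this chart can be confirmed from the same value_counts:
# Numeric check of the most and least frequent hotel clusters
cluster_counts = hotels_train_df['hotel_cluster'].value_counts()
print("Most frequent clusters:\n", cluster_counts.head(3))
print("Least frequent clusters:\n", cluster_counts.tail(3))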
I'm going to calculate correlation to get a sense of the relationships between some of the variables, which will help in data understanding and determining which predictive models might be most effective.
# Pearson Ranking
# Setting up figure size
plt.rcParams['figure.figsize'] = (15, 9)
# Choosing attributes to compare
features = ['srch_destination_id', 'user_location_country', 'srch_adults_cnt', 'srch_children_cnt',
            'hotel_continent', 'site_name']
# Extracting numpy arrays
X = hotels_train_df[features].values
# Instantiating, fitting, and transforming the visualizer with the Pearson ranking algorithm
visualizer = Rank2D(features=features, algorithm='pearson')
visualizer.fit(X)
visualizer.transform(X)
visualizer.poof(outpath="pearson_ranking.png") # Drawing the data and saving the output
plt.show()
It looks like the strongest correlation is a weak positive relationship of roughly 0.25 between hotel continent and site name; the remaining pairs are weaker still. This suggests we don't have to worry about multicollinearity when choosing predictive models. A quick rank-based check on the strongest pair is sketched below.
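Since spearmanr and kendalltau are already imported, the strongest pair can be double-checked with rank-based coefficients, which also report p-values; a quick sketch on the same columns:
# Rank-based correlation check for the strongest pair from the Pearson ranking
rho, rho_p = spearmanr(hotels_train_df['hotel_continent'], hotels_train_df['site_name'])
tau, tau_p = kendalltau(hotels_train_df['hotel_continent'], hotels_train_df['site_name'])
print("Spearman rho: {:.3f} (p = {:.3g})".format(rho, rho_p))
print("Kendall tau: {:.3f} (p = {:.3g})".format(tau, tau_p))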
Since we are trying to predict the unique hotel cluster, we are dealing with a multi-class classification problem. First, I will look at how many hotel clusters exist.
# Convert hotel_cluster column to string
hotel_clusters = hotels_train_df['hotel_cluster'].astype(str)
hotel_clusters.describe(include=['O'])
count     500000
unique       100
top           91
freq       13958
Name: hotel_cluster, dtype: object
Our target variable, hotel_cluster, consists of 100 unique values.
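With 100 classes, a useful baseline is the accuracy of always predicting the most frequent cluster (91, with 13,958 of 500,000 rows, or about 2.8%); any model should beat this. A minimal check:
# Baseline accuracy: always predict the most frequent hotel cluster
majority_share = hotels_train_df['hotel_cluster'].value_counts(normalize=True).iloc[0]
print("Majority-class baseline accuracy: {:.1%}".format(majority_share))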
# Splitting the data into independent and dependent variables
features = ['srch_destination_id', 'user_location_country', 'srch_adults_cnt', 'srch_children_cnt',
            'hotel_continent', 'site_name']
X = hotels_train_df[features].values
y = hotels_train_df['hotel_cluster'].values
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)
# Number of samples in each set
print("No. of samples in training set: ", X_train.shape[0])
print("No. of samples in test set:", X_test.shape[0])
No. of samples in training set:  375000
No. of samples in test set: 125000
I chose a random forest classifier because, as an ensemble of decision trees, it tends to be more accurate and less prone to overfitting than a single tree, and it handles the larger amount of data I'm working with.
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=42, verbose=0,
warm_start=False)
# Predicting test set results
y_pred = classifier.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Actual Hotel Cluster'], colnames=['Predicted Hotel Cluster'])
| Predicted Hotel Cluster | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Actual Hotel Cluster | |||||||||||||||||||||
| 0 | 499 | 5 | 0 | 1 | 6 | 7 | 7 | 0 | 0 | 4 | ... | 5 | 18 | 6 | 0 | 1 | 20 | 38 | 3 | 0 | 0 |
| 1 | 1 | 1236 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 349 | 2 | 9 | 30 | 8 | 4 | 25 | 24 | ... | 14 | 48 | 2 | 3 | 3 | 93 | 0 | 47 | 28 | 28 |
| 3 | 5 | 0 | 10 | 94 | 3 | 11 | 4 | 1 | 4 | 6 | ... | 12 | 12 | 3 | 6 | 0 | 10 | 3 | 2 | 3 | 7 |
| 4 | 10 | 0 | 5 | 5 | 199 | 7 | 19 | 3 | 3 | 12 | ... | 3 | 52 | 0 | 3 | 3 | 91 | 4 | 10 | 26 | 8 |
| 5 | 39 | 0 | 45 | 7 | 16 | 329 | 12 | 6 | 34 | 33 | ... | 23 | 48 | 1 | 10 | 6 | 34 | 17 | 26 | 14 | 25 |
| 6 | 21 | 0 | 13 | 1 | 21 | 23 | 211 | 1 | 12 | 26 | ... | 5 | 68 | 1 | 2 | 4 | 42 | 8 | 6 | 11 | 22 |
| 7 | 3 | 0 | 3 | 3 | 5 | 12 | 10 | 167 | 15 | 4 | ... | 6 | 73 | 1 | 11 | 8 | 6 | 1 | 7 | 0 | 3 |
| 8 | 2 | 0 | 5 | 2 | 6 | 24 | 6 | 3 | 441 | 9 | ... | 7 | 7 | 8 | 6 | 0 | 4 | 8 | 16 | 2 | 18 |
| 9 | 65 | 0 | 33 | 0 | 12 | 19 | 8 | 0 | 32 | 302 | ... | 14 | 34 | 8 | 1 | 1 | 107 | 3 | 34 | 40 | 42 |
| 10 | 21 | 33 | 19 | 4 | 12 | 32 | 14 | 12 | 18 | 26 | ... | 8 | 41 | 4 | 3 | 1 | 23 | 4 | 12 | 13 | 23 |
| 11 | 18 | 0 | 43 | 1 | 4 | 41 | 6 | 4 | 17 | 18 | ... | 2 | 18 | 1 | 2 | 2 | 19 | 3 | 52 | 10 | 9 |
| 12 | 3 | 2 | 7 | 0 | 0 | 13 | 6 | 6 | 1 | 1 | ... | 0 | 1 | 3 | 0 | 0 | 1 | 0 | 4 | 0 | 8 |
| 13 | 16 | 0 | 6 | 0 | 8 | 9 | 12 | 14 | 2 | 20 | ... | 7 | 88 | 0 | 3 | 5 | 28 | 8 | 5 | 12 | 5 |
| 14 | 2 | 2 | 4 | 2 | 2 | 13 | 8 | 6 | 26 | 3 | ... | 7 | 29 | 0 | 3 | 11 | 12 | 2 | 4 | 2 | 7 |
| 15 | 8 | 5 | 15 | 2 | 5 | 10 | 21 | 14 | 11 | 16 | ... | 12 | 63 | 1 | 3 | 6 | 2 | 4 | 11 | 0 | 22 |
| 16 | 16 | 0 | 19 | 6 | 20 | 23 | 29 | 10 | 3 | 8 | ... | 9 | 129 | 5 | 4 | 28 | 54 | 3 | 8 | 11 | 11 |
| 17 | 34 | 1 | 15 | 2 | 30 | 26 | 6 | 1 | 7 | 21 | ... | 5 | 34 | 2 | 4 | 0 | 21 | 11 | 16 | 30 | 4 |
| 18 | 39 | 71 | 28 | 5 | 22 | 11 | 21 | 6 | 0 | 38 | ... | 6 | 165 | 0 | 3 | 16 | 181 | 11 | 9 | 39 | 13 |
| 19 | 25 | 102 | 1 | 1 | 23 | 4 | 7 | 2 | 0 | 8 | ... | 2 | 37 | 2 | 0 | 1 | 100 | 5 | 1 | 20 | 4 |
| 20 | 10 | 0 | 7 | 17 | 1 | 16 | 2 | 2 | 20 | 9 | ... | 13 | 10 | 4 | 6 | 0 | 2 | 2 | 5 | 3 | 7 |
| 21 | 57 | 0 | 42 | 0 | 23 | 19 | 10 | 1 | 6 | 30 | ... | 7 | 60 | 0 | 2 | 4 | 159 | 13 | 64 | 64 | 15 |
| 22 | 2 | 0 | 14 | 5 | 3 | 29 | 5 | 1 | 93 | 7 | ... | 7 | 11 | 12 | 4 | 1 | 5 | 3 | 7 | 0 | 27 |
| 23 | 10 | 3 | 5 | 1 | 26 | 8 | 7 | 3 | 0 | 4 | ... | 3 | 44 | 0 | 0 | 3 | 28 | 3 | 5 | 8 | 0 |
| 24 | 0 | 455 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 25 | 20 | 0 | 47 | 1 | 17 | 43 | 11 | 2 | 14 | 33 | ... | 9 | 40 | 5 | 6 | 3 | 54 | 9 | 73 | 49 | 16 |
| 26 | 124 | 0 | 3 | 2 | 3 | 0 | 1 | 2 | 3 | 2 | ... | 2 | 33 | 24 | 3 | 1 | 4 | 31 | 4 | 3 | 1 |
| 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | ... | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 2 | 0 | 0 |
| 28 | 23 | 0 | 27 | 1 | 32 | 12 | 15 | 6 | 10 | 35 | ... | 17 | 151 | 0 | 2 | 16 | 66 | 5 | 6 | 18 | 7 |
| 29 | 0 | 0 | 70 | 2 | 0 | 49 | 6 | 3 | 8 | 7 | ... | 6 | 2 | 0 | 5 | 0 | 4 | 0 | 14 | 1 | 13 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 70 | 14 | 1 | 7 | 2 | 14 | 11 | 2 | 2 | 7 | 37 | ... | 2 | 70 | 2 | 2 | 3 | 57 | 5 | 46 | 121 | 5 |
| 71 | 20 | 161 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 5 | 12 | 0 | 1 | 1 | 4 | 6 | 0 | 0 | 0 |
| 72 | 32 | 0 | 18 | 1 | 20 | 27 | 10 | 6 | 2 | 32 | ... | 5 | 111 | 1 | 6 | 12 | 91 | 13 | 6 | 31 | 6 |
| 73 | 43 | 0 | 7 | 4 | 0 | 14 | 3 | 4 | 4 | 6 | ... | 28 | 14 | 16 | 5 | 2 | 20 | 16 | 15 | 22 | 6 |
| 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 75 | 12 | 0 | 3 | 2 | 5 | 7 | 8 | 4 | 9 | 5 | ... | 8 | 10 | 5 | 3 | 0 | 8 | 8 | 5 | 3 | 8 |
| 76 | 14 | 3 | 2 | 4 | 12 | 10 | 13 | 7 | 10 | 10 | ... | 3 | 46 | 1 | 2 | 11 | 22 | 7 | 16 | 7 | 8 |
| 77 | 4 | 0 | 8 | 1 | 9 | 4 | 13 | 9 | 6 | 7 | ... | 6 | 88 | 0 | 4 | 5 | 18 | 8 | 10 | 6 | 4 |
| 78 | 1 | 6 | 15 | 12 | 0 | 19 | 2 | 2 | 71 | 2 | ... | 19 | 2 | 6 | 8 | 1 | 3 | 5 | 9 | 0 | 30 |
| 79 | 7 | 604 | 0 | 0 | 2 | 1 | 2 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
| 80 | 59 | 0 | 1 | 1 | 2 | 6 | 3 | 0 | 1 | 0 | ... | 7 | 4 | 21 | 0 | 0 | 8 | 4 | 0 | 1 | 1 |
| 81 | 0 | 3 | 5 | 9 | 0 | 4 | 14 | 3 | 23 | 3 | ... | 4 | 3 | 2 | 4 | 1 | 0 | 0 | 2 | 0 | 8 |
| 82 | 0 | 0 | 40 | 9 | 2 | 22 | 12 | 5 | 17 | 8 | ... | 13 | 5 | 0 | 3 | 0 | 1 | 0 | 5 | 3 | 40 |
| 83 | 67 | 15 | 8 | 3 | 11 | 44 | 10 | 5 | 10 | 30 | ... | 5 | 82 | 1 | 4 | 12 | 43 | 7 | 15 | 32 | 15 |
| 84 | 79 | 70 | 1 | 1 | 3 | 6 | 1 | 1 | 2 | 1 | ... | 1 | 29 | 30 | 0 | 2 | 12 | 23 | 9 | 1 | 1 |
| 85 | 3 | 0 | 17 | 11 | 0 | 29 | 4 | 9 | 10 | 9 | ... | 10 | 3 | 1 | 4 | 0 | 2 | 0 | 4 | 4 | 19 |
| 86 | 12 | 0 | 6 | 0 | 3 | 8 | 0 | 0 | 15 | 1 | ... | 1 | 6 | 19 | 2 | 2 | 8 | 5 | 17 | 4 | 4 |
| 87 | 4 | 0 | 0 | 2 | 0 | 2 | 4 | 0 | 1 | 1 | ... | 1 | 2 | 0 | 1 | 0 | 0 | 4 | 5 | 6 | 8 |
| 88 | 0 | 325 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 89 | 17 | 2 | 11 | 3 | 6 | 11 | 3 | 6 | 27 | 0 | ... | 9 | 20 | 1 | 5 | 4 | 23 | 8 | 14 | 4 | 7 |
| 90 | 19 | 18 | 14 | 6 | 9 | 25 | 8 | 7 | 16 | 9 | ... | 181 | 43 | 6 | 18 | 4 | 19 | 19 | 15 | 10 | 13 |
| 91 | 40 | 44 | 21 | 3 | 33 | 20 | 39 | 23 | 6 | 33 | ... | 30 | 1036 | 3 | 6 | 87 | 143 | 19 | 11 | 20 | 11 |
| 92 | 28 | 11 | 0 | 1 | 0 | 3 | 1 | 1 | 1 | 2 | ... | 1 | 12 | 279 | 4 | 0 | 3 | 9 | 9 | 0 | 1 |
| 93 | 21 | 0 | 8 | 11 | 9 | 12 | 5 | 3 | 13 | 15 | ... | 25 | 46 | 7 | 78 | 4 | 18 | 9 | 3 | 3 | 6 |
| 94 | 19 | 23 | 7 | 0 | 6 | 10 | 10 | 13 | 2 | 3 | ... | 6 | 168 | 3 | 2 | 135 | 38 | 12 | 4 | 6 | 2 |
| 95 | 15 | 0 | 40 | 4 | 35 | 7 | 15 | 0 | 3 | 66 | ... | 9 | 100 | 2 | 4 | 4 | 549 | 19 | 21 | 72 | 11 |
| 96 | 152 | 9 | 0 | 6 | 8 | 5 | 1 | 0 | 1 | 2 | ... | 13 | 41 | 9 | 3 | 2 | 38 | 145 | 0 | 1 | 8 |
| 97 | 20 | 0 | 31 | 2 | 9 | 22 | 0 | 1 | 20 | 36 | ... | 5 | 28 | 12 | 2 | 3 | 47 | 3 | 336 | 51 | 8 |
| 98 | 21 | 1 | 37 | 3 | 20 | 9 | 6 | 0 | 3 | 60 | ... | 2 | 31 | 1 | 4 | 0 | 147 | 7 | 81 | 348 | 6 |
| 99 | 9 | 3 | 35 | 3 | 13 | 25 | 15 | 4 | 76 | 31 | ... | 3 | 26 | 2 | 3 | 2 | 50 | 5 | 18 | 10 | 268 |
100 rows × 100 columns
# Confusion Matrix
confusion_matrix(y_test,y_pred)
array([[ 499, 5, 0, ..., 3, 0, 0],
[ 1, 1236, 0, ..., 0, 0, 0],
[ 0, 0, 349, ..., 47, 28, 28],
...,
[ 20, 0, 31, ..., 336, 51, 8],
[ 21, 1, 37, ..., 81, 348, 6],
[ 9, 3, 35, ..., 18, 10, 268]])
In these two views of the confusion matrix, many of the larger values fall along the diagonal, which is a good sign. However, the full table also shows sizable values scattered off the diagonal, which means there are many incorrect predictions. A heatmap view is sketched below to make the pattern easier to see.
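A 100 x 100 table is hard to scan, so a heatmap makes the diagonal stand out; a minimal sketch using the imports above (tick labels suppressed for readability):
# Visualizing the confusion matrix as a heatmap to highlight the diagonal
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12, 9))
sns.heatmap(cm, cmap='Blues', xticklabels=False, yticklabels=False)
plt.xlabel('Predicted Hotel Cluster')
plt.ylabel('Actual Hotel Cluster')
plt.title('Random Forest Confusion Matrix (100 clusters)')
plt.show()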
# Classification Report
print("Classification Report:\n\n", classification_report(y_test,y_pred))
Classification Report:
precision recall f1-score support
0 0.20 0.40 0.26 1237
1 0.27 0.86 0.41 1434
2 0.20 0.24 0.22 1476
3 0.22 0.13 0.16 722
4 0.19 0.18 0.19 1116
5 0.19 0.16 0.17 2024
6 0.21 0.16 0.18 1311
7 0.28 0.20 0.23 844
8 0.26 0.39 0.31 1144
9 0.18 0.18 0.18 1633
10 0.18 0.12 0.15 1298
11 0.17 0.15 0.15 1162
12 0.26 0.26 0.26 838
13 0.22 0.22 0.22 1062
14 0.21 0.13 0.16 643
15 0.23 0.17 0.20 1081
16 0.22 0.18 0.20 1566
17 0.19 0.09 0.12 1180
18 0.20 0.14 0.17 1837
19 0.15 0.08 0.11 868
20 0.20 0.14 0.17 1076
21 0.19 0.16 0.17 1819
22 0.24 0.29 0.26 1019
23 0.19 0.11 0.14 764
24 0.28 0.06 0.11 557
25 0.19 0.18 0.18 1852
26 0.29 0.38 0.33 1372
27 0.71 0.87 0.78 381
28 0.20 0.22 0.21 1690
29 0.23 0.24 0.24 1333
30 0.18 0.14 0.16 1589
31 0.25 0.20 0.22 869
32 0.22 0.16 0.19 925
33 0.19 0.12 0.15 1309
34 0.19 0.19 0.19 1046
35 0.19 0.16 0.17 450
36 0.27 0.25 0.26 1483
37 0.21 0.13 0.16 1767
38 0.23 0.19 0.21 966
39 0.21 0.21 0.21 1058
40 0.16 0.13 0.15 1392
41 0.14 0.14 0.14 2567
42 0.24 0.22 0.23 1868
43 0.26 0.23 0.24 859
44 0.23 0.15 0.18 1022
45 0.27 0.07 0.12 693
46 0.27 0.40 0.32 1749
47 0.21 0.16 0.18 1510
48 0.24 0.21 0.23 2473
49 0.15 0.09 0.11 773
50 0.19 0.14 0.16 1564
51 0.21 0.09 0.13 1220
52 0.33 0.30 0.32 1081
53 0.26 0.18 0.21 468
54 0.53 0.30 0.38 814
55 0.16 0.12 0.14 1408
56 0.19 0.27 0.22 1271
57 0.25 0.23 0.24 1039
58 0.20 0.16 0.18 1601
59 0.19 0.21 0.20 1880
60 0.22 0.15 0.18 705
61 0.20 0.13 0.16 1218
62 0.25 0.23 0.24 1674
63 0.28 0.46 0.35 918
64 0.21 0.36 0.26 2439
65 0.30 0.75 0.43 2024
66 0.39 0.14 0.21 883
67 0.27 0.26 0.26 893
68 0.16 0.16 0.16 1752
69 0.22 0.11 0.14 1197
70 0.16 0.36 0.22 1909
71 0.19 0.39 0.25 761
72 0.17 0.13 0.15 1491
73 0.18 0.14 0.16 1110
74 0.91 0.68 0.78 164
75 0.23 0.13 0.16 624
76 0.20 0.11 0.14 1079
77 0.18 0.16 0.17 1296
78 0.24 0.16 0.19 1334
79 0.20 0.12 0.15 878
80 0.36 0.46 0.40 730
81 0.26 0.24 0.25 1034
82 0.29 0.31 0.30 1737
83 0.14 0.19 0.16 1709
84 0.18 0.16 0.17 986
85 0.24 0.17 0.20 1223
86 0.26 0.22 0.24 722
87 0.27 0.12 0.17 739
88 0.23 0.10 0.14 416
89 0.24 0.14 0.17 842
90 0.20 0.13 0.16 1359
91 0.20 0.30 0.24 3484
92 0.39 0.32 0.35 874
93 0.21 0.10 0.13 790
94 0.21 0.14 0.17 944
95 0.15 0.31 0.20 1773
96 0.17 0.12 0.14 1201
97 0.19 0.21 0.20 1633
98 0.21 0.18 0.19 1912
99 0.21 0.18 0.19 1490
accuracy 0.22 125000
macro avg 0.24 0.22 0.22 125000
weighted avg 0.22 0.22 0.21 125000
# Accuracy of model
print("Accuracy:", accuracy_score(y_test, y_pred))
Accuracy: 0.220376
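Since the forest is already fitted, its feature importances give a rough sense of which inputs drive the predictions, which could guide later feature experiments:
# Inspecting which features the random forest relied on most
importances = pd.Series(classifier.feature_importances_, index=features)
print(importances.sort_values(ascending=False))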
I'm going to try a Naive Bayes classifier next, since the correlation analysis suggested the features are largely independent, and Naive Bayes tends to perform well with multiple classes.
# Training Naive Bayes classifier
gnb = GaussianNB().fit(X_train, y_train)
gnb_predictions = gnb.predict(X_test)
# Accuracy
accuracy = gnb.score(X_test, y_test)
print("Accuracy:", accuracy)
Accuracy: 0.054088
# Confusion matrix
confusion_matrix(y_test, gnb_predictions)
array([[ 0, 452, 0, ..., 0, 0, 0],
[ 0, 1080, 0, ..., 0, 0, 0],
[ 18, 198, 0, ..., 0, 29, 0],
...,
[ 65, 407, 0, ..., 0, 7, 0],
[ 15, 830, 0, ..., 0, 60, 0],
[ 40, 199, 0, ..., 0, 14, 0]])
Overall, my predictive models performed quite poorly. The Random Forest Classifier reached 22% accuracy, and the Naive Bayes Classifier only 5%. The highest precision score from the Random Forest Classifier was 91% for hotel cluster 74, but most of the rest were very low. To improve predictive power, it would help to have more information on what the attributes represent; for example, knowing how the hotel groups are determined and which locations correspond to the country and continent numbers would make the results more interpretable. In addition, I could experiment with different combinations of features and different model parameters (a small parameter search is sketched below). Finally, I could try building ensembles of different models to try to achieve better accuracy and interpretability.
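As one concrete next step, here is a hedged sketch of a small parameter search for the random forest; the grid values are illustrative, and the training set is subsampled because a full search over 375,000 rows would be slow:
# Sketch of a small hyperparameter search for the random forest
# (parameter values are illustrative, not tuned recommendations)
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1)
search.fit(X_train[:50000], y_train[:50000])  # subsample to keep the search tractable
print("Best parameters:", search.best_params_)
print("Best CV accuracy: {:.3f}".format(search.best_score_))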