import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from scipy.stats import spearmanr
from scipy.stats import kendalltau

import yellowbrick
from yellowbrick.features import Rank2D  # correlation visualization package
from yellowbrick.style import set_palette  # color for yellowbrick visualizer
from yellowbrick.features import ParallelCoordinates

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC


# Loading data
file = "runs.csv"
horse_racing_data = pd.read_csv(file)


# Checking dimensions of the table
print("The dimension of the table is: ", horse_racing_data.shape)

The dimension of the table is:  (79447, 37)


# Previewing data
horse_racing_data.head(5)


# Checking missing data
horse_racing_data.isna().sum()

race_id                0
horse_no               0
horse_id               0
result                 0
won                    0
lengths_behind         0
horse_age              0
horse_country          2
horse_type             2
horse_rating           0
horse_gear             0
declared_weight        0
actual_weight          0
draw                   0
position_sec1          0
position_sec2          0
position_sec3          0
position_sec4      33221
position_sec5      69368
position_sec6      78151
behind_sec1            0
behind_sec2            0
behind_sec3            0
behind_sec4        33221
behind_sec5        69368
behind_sec6        78151
time1                  0
time2                  0
time3                  0
time4              33221
time5              69368
time6              78151
finish_time            0
win_odds               0
place_odds          3735
trainer_id             0
jockey_id              0
dtype: int64


# Removing columns with large amounts of missing data
horse_racing_data = horse_racing_data.drop(columns=['position_sec4', 'position_sec5', 'position_sec6', 
                                                    'behind_sec4', 'behind_sec5','behind_sec6', 'time4', 
                                                    'time5', 'time6', 'place_odds'])

# Looking at the data
horse_racing_data.head(5)


# What type of variables are in the table 

print("Describe Data:\n")
print(horse_racing_data.describe())

print("\nSummarized Data:\n")
print(horse_racing_data.describe(include=['O']))

Describe Data:

            race_id      horse_no      horse_id        result           won  \
count  79447.000000  79447.000000  79447.000000  79447.000000  79447.000000   
mean    3173.352814      6.905623   2204.410525      6.838597      0.080053   
std     1833.101494      3.760711   1275.049375      3.730498      0.271378   
min        0.000000      1.000000      0.000000      1.000000      0.000000   
25%     1586.000000      4.000000   1085.000000      4.000000      0.000000   
50%     3174.000000      7.000000   2209.000000      7.000000      0.000000   
75%     4764.500000     10.000000   3308.000000     10.000000      0.000000   
max     6348.000000     14.000000   4404.000000     14.000000      1.000000   

       lengths_behind     horse_age  horse_rating  declared_weight  \
count    79447.000000  79447.000000  79447.000000     79447.000000   
mean         6.108901      3.339346     61.034904      1104.953568   
std         33.636209      0.876763     11.748788        62.347597   
min         -0.500000      2.000000     10.000000       693.000000   
25%          1.750000      3.000000     60.000000      1062.000000   
50%          4.000000      3.000000     60.000000      1102.000000   
75%          6.750000      3.000000     60.000000      1146.000000   
max        999.000000     10.000000    138.000000      1369.000000   

       actual_weight  ...   behind_sec1   behind_sec2   behind_sec3  \
count   79447.000000  ...  79447.000000  79447.000000  79447.000000   
mean      122.729656  ...      3.378768      4.083972      4.509457   
std         6.305496  ...      4.282529      2.691107     16.541538   
min       103.000000  ...      0.150000      0.150000      0.000000   
25%       118.000000  ...      1.500000      1.750000      1.750000   
50%       123.000000  ...      3.000000      3.750000      3.750000   
75%       128.000000  ...      5.000000      5.750000      5.750000   
max       133.000000  ...    999.000000     60.250000    999.000000   

              time1         time2         time3   finish_time      win_odds  \
count  79447.000000  79447.000000  79447.000000  79447.000000  79447.000000   
mean      21.135438     22.928985     23.864054     85.322914     28.812977   
std        6.930518      3.599727      3.571163     18.512883     30.097375   
min       12.390000     19.990000     21.000000     55.160000      1.000000   
25%       14.120000     22.290000     23.230000     70.590000      7.700000   
50%       24.180000     22.870000     23.760000     83.350000     15.000000   
75%       25.360000     23.520000     24.410000    100.780000     38.000000   
max      999.000000    999.000000    999.000000    163.580000     99.000000   

         trainer_id     jockey_id  
count  79447.000000  79447.000000  
mean      79.793007     85.832341  
std       45.118874     54.338105  
min        0.000000      0.000000  
25%       47.000000     39.000000  
50%       75.000000     76.000000  
75%      118.000000    138.000000  
max      175.000000    185.000000  

[8 rows x 24 columns]

Summarized Data:

       horse_country horse_type horse_gear
count          79445      79445      79447
unique            16          9        822
top              AUS    Gelding         --
freq           29906      75503      57373


# Replacing missing values in horse type
horse_racing_data['horse_type'] = horse_racing_data['horse_type'].fillna('Unknown')

# Checking missing values
print(horse_racing_data['horse_type'].isna().sum())

0


# Replacing missing values in horse country
horse_racing_data['horse_country'] = horse_racing_data['horse_country'].fillna('Unknown')

# Checking missing values
print(horse_racing_data['horse_country'].isna().sum())

0


# Setting figure and size
plt.figure(figsize=(9, 6))
plt.subplot()

# Histogram of horse and jockey weight
plt.hist(horse_racing_data['declared_weight'], color='purple', bins=40)
plt.xlabel('Weight of Horse & Jockey (lbs)', fontsize=20)  # Adding an x-label
plt.ylabel('Counts', fontsize=20)  # Adding a y-label
plt.tick_params(axis='both', labelsize=15)  # Setting font size of labels

plt.show()


# Setting figure and size
plt.figure(figsize=(9, 6))
plt.subplot()

# Histogram of Hong Kong Jockey Club horse rating
plt.hist(horse_racing_data['horse_rating'], color='purple', bins=40)
plt.xlabel('HKJC Horse Rating', fontsize=20)  # Adding an x-label
plt.ylabel('Counts', fontsize=20)  # Adding a y-label
plt.tick_params(axis='both', labelsize=15)  # Setting font size of labels

plt.show()


# Setting figure and size
plt.figure(figsize=(9, 6))
plt.subplot()

# Histogram of horse ranking in section 1 of race
plt.hist(horse_racing_data['position_sec1'], color='purple', bins=25)
plt.xlabel('Race Section 1 - Horse Ranking', fontsize=20)  # Adding an x-label
plt.ylabel('Counts', fontsize=20)  # Adding a y-label
plt.tick_params(axis='both', labelsize=15)  # Setting font size of labels

plt.show()


# Setting figure and size
plt.figure(figsize=(9, 6))
plt.subplot()

# Histogram of horse ranking in section 1 of race
plt.hist(horse_racing_data['win_odds'], color='purple', bins=40)
plt.xlabel('Win Odds for Horse at Start of Race', fontsize=20)  # Adding an x-label
plt.ylabel('Counts', fontsize=20)  # Adding a y-label
plt.tick_params(axis='both', labelsize=15)  # Setting font size of labels

plt.show()


# Setting up the figure size
plt.rcParams['figure.figsize'] = (20, 10)

# Making subplots
fig, axes = plt.subplots(nrows = 2, ncols = 2)

# --WON--

# make the data read to feed into the visualizer
X_Won = horse_racing_data.replace({'won': {1: 'yes', 0: 'no'}}).groupby('won').size().reset_index(name='Counts')['won']
Y_Won = horse_racing_data.replace({'won': {1: 'yes', 0: 'no'}}).groupby('won').size().reset_index(name='Counts')['Counts']
# make the bar plot
axes[0, 0].bar(X_Won, Y_Won)
axes[0, 0].set_title('Won', fontsize=25)
axes[0, 0].set_ylabel('Counts', fontsize=20)
axes[0, 0].tick_params(axis='both', labelsize=15)

# --TYPE--

# make the data read to feed into the visualizer
X_HorseType = horse_racing_data.groupby('horse_type').size().reset_index(name='Counts')['horse_type']
Y_HorseType = horse_racing_data.groupby('horse_type').size().reset_index(name='Counts')['Counts']
# make the bar plot
axes[0, 1].bar(X_HorseType, Y_HorseType)
axes[0, 1].set_title('Horse Type', fontsize=25)
axes[0, 1].set_ylabel('Counts', fontsize=20)
axes[0, 1].tick_params(axis='both', labelsize=15)

# --COUNTRY--

# make the data read to feed into the visualizer
X_Country = horse_racing_data.groupby('horse_country').size().reset_index(name='Counts')['horse_country']
Y_Country = horse_racing_data.groupby('horse_country').size().reset_index(name='Counts')['Counts']
# make the bar plot
axes[1, 0].bar(X_Country, Y_Country)
axes[1, 0].set_title('Horse Country', fontsize=25)
axes[1, 0].set_ylabel('Counts', fontsize=20)
axes[1, 0].tick_params(axis='x', labelsize=15, labelrotation=70)
axes[1, 0].tick_params(axis='y', labelsize=15)

# --AGE--

# make the data read to feed into the visualizer
X_Age = horse_racing_data.groupby('horse_age').size().reset_index(name='Counts')['horse_age']
Y_Age = horse_racing_data.groupby('horse_age').size().reset_index(name='Counts')['Counts']
# make the bar plot
axes[1, 1].bar(X_Age, Y_Age)
axes[1, 1].set_title('Horse Age', fontsize=25)
axes[1, 1].set_ylabel('Counts', fontsize=20)
axes[1, 1].tick_params(axis='both', labelsize=15)

plt.show()


# Pearson Ranking

#set up the figure size
plt.rcParams['figure.figsize'] = (15, 7)

features = ['declared_weight', 'horse_rating', 'position_sec1', 'win_odds']

# extract the numpy arrays from the data frame
X = horse_racing_data[features].values

# instantiate the visualizer with the Covariance ranking algorithm
visualizer = Rank2D(features=features, algorithm='pearson')
visualizer.fit(X)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof(outpath="pearson_ranking.png") # Draw/show/poof the data
plt.show()


# calculate spearman's correlation - horse_rating vs position_sec1
coef, p = spearmanr(horse_racing_data['horse_rating'], horse_racing_data['position_sec1'])
print('horse_rating vs position_sec1:')
print('Spearmans correlation coefficient: %.3f' % coef)

# interpret the significance
alpha = 0.05
if p > alpha:
    print('Features are uncorrelated. p=%.5f' % p)
else:
    print('Features are correlated. p=%.5f' % p)
    
print('\n')
    
# calculate spearman's correlation - horse_rating vs win_odds
coef2, p2 = spearmanr(horse_racing_data['horse_rating'], horse_racing_data['win_odds'])
print('horse_rating vs win_odds:')
print('Spearmans correlation coefficient: %.3f' % coef2)

# interpret the significance
alpha = 0.05
if p2 > alpha:
    print('Features are uncorrelated. p=%.5f' % p2)
else:
    print('Features are correlated. p=%.5f' % p2)

horse_rating vs position_sec1:
Spearmans correlation coefficient: -0.012
Features are correlated. p=0.00048


horse_rating vs win_odds:
Spearmans correlation coefficient: 0.004
Features are uncorrelated. p=0.25680


# calculate kendall's correlation - horse_rating vs position_sec1
k_coef, k_p = kendalltau(horse_racing_data['horse_rating'], horse_racing_data['position_sec1'])
print('horse_rating vs position_sec1:')
print('Kendall correlation coefficient: %.3f' % k_coef)

# interpret the significance
alpha = 0.05
if k_p > alpha:
    print('Features are uncorrelated. p=%.3f' % k_p)
else:
    print('Features are correlated. p=%.3f' % k_p)
    
print('\n')    

# calculate kendall's correlation - horse_rating vs win_odds
k_coef2, k_p2 = kendalltau(horse_racing_data['horse_rating'], horse_racing_data['win_odds'])
print('horse_rating vs win_odds:')
print('Kendall correlation coefficient: %.3f' % k_coef2)

# interpret the significance
alpha = 0.05
if k_p2 > alpha:
    print('Features are uncorrelated. p=%.3f' % k_p2)
else:
    print('Features are correlated. p=%.3f' % k_p2)

horse_rating vs position_sec1:
Kendall correlation coefficient: -0.010
Features are correlated. p=0.000


horse_rating vs win_odds:
Kendall correlation coefficient: 0.003
Features are uncorrelated. p=0.207


# copy data to a new dataframe
horse_racing_data_norm = horse_racing_data.copy()


# normalize data to 0-1 range
for feature in features:
    horse_racing_data_norm[feature] = ((horse_racing_data[feature] - horse_racing_data[feature].mean()) 
                                       / (horse_racing_data[feature].max() - horse_racing_data[feature].min()))


# Extract the numpy arrays from the data frame
X = horse_racing_data_norm[features].values
y = horse_racing_data.won.values


#set up the figure size
plt.rcParams['font.size'] = 50

# Specify the features of interest and the classes of the target
classes = ['Won', 'Not-won']
features = ['declared_weight', 'horse_rating', 'position_sec1', 'win_odds']

# Instantiate the visualizer
visualizer = ParallelCoordinates(
    classes=classes, features=features,
    normalize='standard', sample=0.02, shuffle=True,
)

visualizer.fit_transform(X, y)      # Fit the data to the visualizer
visualizer.poof(outpath="parallelcoords.png") # Draw/show/poof the data
plt.show()


# Stacked bar charts to compare won/not won

#set up the figure size
plt.rcParams['figure.figsize'] = (20, 10)

# make subplots
fig, axes = plt.subplots(nrows = 2, ncols = 2)


# --COUNTRY--

# make the data read to feed into the visualizer
Country_won = horse_racing_data.replace({'won': {1: 'Won', 0: 'Not-won'}})[horse_racing_data['won']==1]['horse_country'].value_counts()
Country_not_won = horse_racing_data.replace({'won': {1: 'Won', 0: 'Not-won'}})[horse_racing_data['won']==0]['horse_country'].value_counts()
Country_not_won = Country_not_won.reindex(index = Country_won.index)
# make the bar plot
p1 = axes[0, 0].bar(Country_won.index, Country_won.values)
p2 = axes[0, 0].bar(Country_not_won.index, Country_not_won.values, bottom=Country_won.values)
axes[0, 0].set_title('Country', fontsize=25)
axes[0, 0].set_ylabel('Counts', fontsize=20)
axes[0, 0].tick_params(axis='both', labelsize=15)
axes[0, 0].legend((p1[0], p2[0]), ('Won', 'Not-won'), fontsize = 15)


# --TYPE--

# make the data read to feed into the visualizer
Type_won = horse_racing_data.replace({'won': {1: 'Won', 0: 'Not-won'}})[horse_racing_data['won']==1]['horse_type'].value_counts()
Type_not_won = horse_racing_data.replace({'won': {1: 'Won', 0: 'Not-won'}})[horse_racing_data['won']==0]['horse_type'].value_counts()
Type_not_won = Type_not_won.reindex(index = Type_won.index)
# make the bar plot
p3 = axes[0, 1].bar(Type_won.index, Type_won.values)
p4 = axes[0, 1].bar(Type_not_won.index, Type_not_won.values, bottom=Type_won.values)
axes[0, 1].set_title('Horse Type', fontsize=25)
axes[0, 1].set_ylabel('Counts', fontsize=20)
axes[0, 1].tick_params(axis='both', labelsize=15)
axes[0, 1].legend((p3[0], p4[0]), ('Won', 'Not-won'), fontsize = 15)


# --AGE--

# make the data read to feed into the visualizer
Age_won = horse_racing_data.replace({'won': {1: 'Won', 0: 'Not-won'}})[horse_racing_data['won']==1]['horse_age'].value_counts()
Age_not_won = horse_racing_data.replace({'won': {1: 'Won', 0: 'Not-won'}})[horse_racing_data['won']==0]['horse_age'].value_counts()
Age_not_won = Age_not_won.reindex(index = Age_won.index)
# make the bar plot
p5 = axes[1, 0].bar(Age_won.index, Age_won.values)
p6 = axes[1, 0].bar(Age_not_won.index, Age_not_won.values, bottom=Age_won.values)
axes[1, 0].set_title('Horse Age', fontsize=25)
axes[1, 0].set_ylabel('Counts', fontsize=20)
axes[1, 0].tick_params(axis='both', labelsize=15)
axes[1, 0].legend((p5[0], p6[0]), ('Won', 'Not-won'), fontsize = 15)

<matplotlib.legend.Legend at 0x1c29652d30>


# Replacing "Unknown" values with "NaN"
horse_racing_data = horse_racing_data.replace('Unknown', np.nan)


# Checking for missing values
print(horse_racing_data.isna().sum())

race_id            0
horse_no           0
horse_id           0
result             0
won                0
lengths_behind     0
horse_age          0
horse_country      2
horse_type         2
horse_rating       0
horse_gear         0
declared_weight    0
actual_weight      0
draw               0
position_sec1      0
position_sec2      0
position_sec3      0
behind_sec1        0
behind_sec2        0
behind_sec3        0
time1              0
time2              0
time3              0
finish_time        0
win_odds           0
trainer_id         0
jockey_id          0
dtype: int64


# Filling in missing horse_type and horse_country data with most represented values

horse_racing_data['horse_type'].fillna('Gelding', inplace=True)

horse_racing_data['horse_country'].fillna('AUS', inplace=True)


# Checking results

print('horse_type:\n', horse_racing_data['horse_type'].describe())

print('\nhorse_country:\n', horse_racing_data['horse_country'].describe())

horse_type:
 count       79447
unique          9
top       Gelding
freq        75505
Name: horse_type, dtype: object

horse_country:
 count     79447
unique       16
top         AUS
freq      29908
Name: horse_country, dtype: object


# log-transformation of win_odds
def log_transformation(data):
    return data.apply(np.log1p)

horse_racing_data['win_odds_log1p'] = log_transformation(horse_racing_data['win_odds'])

# check the data
print(horse_racing_data.describe())

            race_id      horse_no      horse_id        result           won  \
count  79447.000000  79447.000000  79447.000000  79447.000000  79447.000000   
mean    3173.352814      6.905623   2204.410525      6.838597      0.080053   
std     1833.101494      3.760711   1275.049375      3.730498      0.271378   
min        0.000000      1.000000      0.000000      1.000000      0.000000   
25%     1586.000000      4.000000   1085.000000      4.000000      0.000000   
50%     3174.000000      7.000000   2209.000000      7.000000      0.000000   
75%     4764.500000     10.000000   3308.000000     10.000000      0.000000   
max     6348.000000     14.000000   4404.000000     14.000000      1.000000   

       lengths_behind     horse_age  horse_rating  declared_weight  \
count    79447.000000  79447.000000  79447.000000     79447.000000   
mean         6.108901      3.339346     61.034904      1104.953568   
std         33.636209      0.876763     11.748788        62.347597   
min         -0.500000      2.000000     10.000000       693.000000   
25%          1.750000      3.000000     60.000000      1062.000000   
50%          4.000000      3.000000     60.000000      1102.000000   
75%          6.750000      3.000000     60.000000      1146.000000   
max        999.000000     10.000000    138.000000      1369.000000   

       actual_weight  ...   behind_sec2   behind_sec3         time1  \
count   79447.000000  ...  79447.000000  79447.000000  79447.000000   
mean      122.729656  ...      4.083972      4.509457     21.135438   
std         6.305496  ...      2.691107     16.541538      6.930518   
min       103.000000  ...      0.150000      0.000000     12.390000   
25%       118.000000  ...      1.750000      1.750000     14.120000   
50%       123.000000  ...      3.750000      3.750000     24.180000   
75%       128.000000  ...      5.750000      5.750000     25.360000   
max       133.000000  ...     60.250000    999.000000    999.000000   

              time2         time3   finish_time      win_odds    trainer_id  \
count  79447.000000  79447.000000  79447.000000  79447.000000  79447.000000   
mean      22.928985     23.864054     85.322914     28.812977     79.793007   
std        3.599727      3.571163     18.512883     30.097375     45.118874   
min       19.990000     21.000000     55.160000      1.000000      0.000000   
25%       22.290000     23.230000     70.590000      7.700000     47.000000   
50%       22.870000     23.760000     83.350000     15.000000     75.000000   
75%       23.520000     24.410000    100.780000     38.000000    118.000000   
max      999.000000    999.000000    163.580000     99.000000    175.000000   

          jockey_id  win_odds_log1p  
count  79447.000000    79447.000000  
mean      85.832341        2.918318  
std       54.338105        0.982657  
min        0.000000        0.693147  
25%       39.000000        2.163323  
50%       76.000000        2.772589  
75%      138.000000        3.663562  
max      185.000000        4.605170  

[8 rows x 25 columns]


# adjust skewed data (win_odds)
# check the distribution using histogram

# set up the figure size
plt.rcParams['figure.figsize'] = (10, 5)

plt.hist(horse_racing_data['win_odds_log1p'], bins=40)
plt.xlabel('win_odds_log1p', fontsize=20)
plt.ylabel('Counts', fontsize=20)
plt.tick_params(axis='both', labelsize=15)
#plt.show()


# get the categorical data
cat_features = ['horse_country', 'horse_type']
data_cat = horse_racing_data[cat_features]

# One Hot Encoding 
data_cat_dummies = pd.get_dummies(data_cat)

# check the data
print(data_cat_dummies.head(8))

   horse_country_ARG  horse_country_AUS  horse_country_BRZ  horse_country_CAN  \
0                  0                  1                  0                  0   
1                  0                  0                  0                  0   
2                  0                  0                  0                  0   
3                  0                  0                  0                  0   
4                  0                  0                  0                  0   
5                  0                  0                  0                  0   
6                  0                  0                  0                  0   
7                  0                  1                  0                  0   

   horse_country_FR  horse_country_GB  horse_country_GER  horse_country_GR  \
0                 0                 0                  0                 0   
1                 0                 0                  0                 0   
2                 0                 0                  0                 0   
3                 0                 0                  0                 0   
4                 0                 1                  0                 0   
5                 0                 0                  0                 0   
6                 0                 0                  0                 0   
7                 0                 0                  0                 0   

   horse_country_IRE  horse_country_ITY  ...  horse_country_ZIM  \
0                  0                  0  ...                  0   
1                  0                  0  ...                  0   
2                  0                  0  ...                  0   
3                  0                  0  ...                  0   
4                  0                  0  ...                  0   
5                  0                  0  ...                  0   
6                  0                  0  ...                  0   
7                  0                  0  ...                  0   

   horse_type_Brown  horse_type_Colt  horse_type_Filly  horse_type_Gelding  \
0                 0                0                 0                   1   
1                 0                0                 0                   1   
2                 0                0                 0                   1   
3                 0                0                 0                   1   
4                 0                0                 0                   1   
5                 0                0                 0                   1   
6                 0                0                 0                   1   
7                 0                0                 0                   1   

   horse_type_Grey  horse_type_Horse  horse_type_Mare  horse_type_Rig  \
0                0                 0                0               0   
1                0                 0                0               0   
2                0                 0                0               0   
3                0                 0                0               0   
4                0                 0                0               0   
5                0                 0                0               0   
6                0                 0                0               0   
7                0                 0                0               0   

   horse_type_Roan  
0                0  
1                0  
2                0  
3                0  
4                0  
5                0  
6                0  
7                0  

[8 rows x 25 columns]


# Removing columns containing little useful information
horse_racing_data = horse_racing_data.drop(columns=['race_id', 'horse_no', 'horse_id', 'trainer_id', 'jockey_id'])

# Looking at the data
horse_racing_data.head(5)


# Create features 
features = horse_racing_data.loc[:,['horse_rating', 'declared_weight', 'actual_weight', 'win_odds_log1p',
                                   'position_sec1']]

# Create target
target = horse_racing_data.won

# Display original data
print("Features:  ", features)
print ("Target:  ", target)

# create random forest classifier object
randomforest=RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=10)

# train model
model_rf = randomforest.fit(features, target)
print(model_rf)

Features:          horse_rating  declared_weight  actual_weight  win_odds_log1p  \
0                60           1020.0            133        2.370244   
1                60            980.0            133        2.833213   
2                60           1082.0            132        1.504077   
3                60           1118.0            127        3.688879   
4                60            972.0            131        3.931826   
5                60           1114.0            127        2.079442   
6                60            978.0            123        4.605170   
7                60           1170.0            128        2.564949   
8                60           1126.0            123        3.663562   
9                60           1072.0            125        3.688879   
10               60           1135.0            123        2.261763   
11               60           1018.0            123        3.178054   
12               60           1089.0            120        1.856298   
13               60           1027.0            113        2.484907   
14               60           1078.0            128        2.708050   
15               60           1257.0            132        3.367296   
16               60           1037.0            130        2.079442   
17               60           1168.0            126        2.564949   
18               60           1148.0            125        1.193922   
19               60           1057.0            121        3.367296   
20               60           1064.0            122        2.639057   
21               60           1132.0            117        3.784190   
22               60           1081.0            121        2.708050   
23               60           1059.0            121        2.397895   
24               60           1106.0            118        3.931826   
25               60           1095.0            103        3.091042   
26               60           1060.0            113        2.639057   
27               60           1110.0            108        3.871201   
28               60           1115.0            133        2.322388   
29               60           1091.0            126        4.605170   
...             ...              ...            ...             ...   
79417           102           1205.0            126        4.553877   
79418           100            999.0            122        2.484907   
79419            96           1098.0            133        2.282382   
79420            94           1017.0            131        2.708050   
79421            91           1132.0            128        3.610918   
79422            88           1104.0            125        2.219203   
79423            87           1125.0            124        2.890372   
79424            87           1220.0            122        2.944439   
79425            86           1065.0            123        4.605170   
79426            86           1153.0            123        2.890372   
79427            85           1113.0            122        0.955511   
79428            85           1193.0            112        3.178054   
79429            83           1188.0            120        4.605170   
79430            83           1103.0            118        4.605170   
79431            81           1121.0            116        4.605170   
79432            81           1124.0            116        3.761200   
79433            94           1004.0            131        3.401197   
79434            93           1096.0            130        2.639057   
79435            93           1218.0            130        3.258097   
79436            92           1172.0            129        3.850148   
79437            91           1147.0            128        0.955511   
79438            90           1150.0            127        4.442651   
79439            89           1154.0            126        3.610918   
79440            88           1060.0            125        2.054124   
79441            86           1068.0            123        4.060443   
79442            87           1191.0            122        4.605170   
79443            84           1070.0            119        4.605170   
79444            83           1148.0            120        4.605170   
79445            82           1266.0            119        1.722767   
79446            81           1092.0            118        4.605170   

       position_sec1  
0                  6  
1                 12  
2                  3  
3                  8  
4                 13  
5                 11  
6                  1  
7                  9  
8                  2  
9                 14  
10                 7  
11                10  
12                 5  
13                 4  
14                11  
15                12  
16                 8  
17                14  
18                 5  
19                 4  
20                13  
21                 3  
22                 9  
23                 6  
24                 7  
25                 1  
26                 2  
27                10  
28                 9  
29                10  
...              ...  
79417              3  
79418              9  
79419             11  
79420              4  
79421              9  
79422              7  
79423             13  
79424             12  
79425             14  
79426             10  
79427              1  
79428              6  
79429              3  
79430              2  
79431              5  
79432              8  
79433             12  
79434              6  
79435              9  
79436              8  
79437             14  
79438              1  
79439              4  
79440              2  
79441             13  
79442              7  
79443              5  
79444             11  
79445             10  
79446              3  

[79447 rows x 5 columns]
Target:   0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
5        0.0
6        0.0
7        1.0
8        0.0
9        0.0
10       0.0
11       0.0
12       0.0
13       0.0
14       0.0
15       0.0
16       0.0
17       0.0
18       1.0
19       0.0
20       0.0
21       0.0
22       0.0
23       0.0
24       0.0
25       0.0
26       0.0
27       0.0
28       0.0
29       0.0
        ... 
79417    0.0
79418    0.0
79419    0.0
79420    0.0
79421    0.0
79422    0.0
79423    0.0
79424    0.0
79425    0.0
79426    0.0
79427    1.0
79428    0.0
79429    0.0
79430    0.0
79431    0.0
79432    0.0
79433    0.0
79434    1.0
79435    0.0
79436    0.0
79437    0.0
79438    0.0
79439    0.0
79440    0.0
79441    0.0
79442    0.0
79443    0.0
79444    0.0
79445    0.0
79446    0.0
Name: won, Length: 79447, dtype: float64
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


# Calculate feature importances
importances = model_rf.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Rearrange feature names so they match the sorted feature importances
names = [features.columns[i] for i in indices]

# Create plot
plt.figure()

# Create plot title
plt.title("Feature Importance")

# Add bars
plt.bar(range(features.shape[1]), importances[indices])

# Add feature names as x-axis labels
plt.xticks(range(features.shape[1]), names, rotation=90)

# Show plot
plt.show()


# Create a whole features dataset that can be used for train and validation data splitting

# Combine numerical features and dummie features together
features_model = ['horse_rating', 'declared_weight', 'actual_weight', 'win_odds_log1p', 'position_sec1']
data_model_X = pd.concat([horse_racing_data[features_model], data_cat_dummies], axis=1)

# Create a whole target dataset that can be used for train and validation data splitting
data_model_y = horse_racing_data.replace({'won': {1: 'won', 0: 'not_won'}})['won']

# separate data into training and validation and check the details of the datasets

# split the data
X_train, X_val, y_train, y_val = train_test_split(data_model_X, data_model_y, test_size =0.3, random_state=11)

# number of samples in each set
print("No. of samples in training set: ", X_train.shape[0])
print("No. of samples in validation set:", X_val.shape[0])

# won and not-won
print('\n')
print('No. of won and not-won in the training set:')
print(y_train.value_counts())

print('\n')
print('No. of won and not-won in the validation set:')
print(y_val.value_counts())

No. of samples in training set:  55612
No. of samples in validation set: 23835


No. of won and not-won in the training set:
not_won    51122
won         4490
Name: won, dtype: int64


No. of won and not-won in the validation set:
not_won    21965
won         1870
Name: won, dtype: int64


# Instantiate the classification model 
model_lr = LogisticRegression(solver='lbfgs', max_iter=150)

# The ConfusionMatrix visualizer taxes a model
classes = ['not_won','won']
cm = ConfusionMatrix(model_lr, classes=classes, percent=False)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit learn.
cm.score(X_val, y_val)

# change fontsize of the labels in the figure
for label in cm.ax.texts:
    label.set_size(20)

# How did we do?
cm.poof()

<matplotlib.axes._subplots.AxesSubplot at 0x1c2db68c88>


# Precision, Recall, and F1 Score

# set the size of the figure and the font size 
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['font.size'] = 20

# Instantiate the visualizer
visualizer = ClassificationReport(model_lr, classes=classes)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_val, y_val)  # Evaluate the model on the test data
g = visualizer.poof()


# ROC and AUC
#Instantiate the visualizer
visualizer = ROCAUC(model_lr)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_val, y_val)  # Evaluate the model on the test data
g = visualizer.poof()

Analyzing Hong Kong horse racing data to predict which horses will win¶

Narrative:¶

Part 1: Graph Analysis¶

Load and preview data¶

Observations so far¶

Data cleanup & summaries¶

Questions that might help predict which horses will win:¶

Data summary information¶

Conclusions based on data summaries¶

Data Visualization: Histograms¶

Data Visualization: Bar Charts¶

Correlation: Pearson Ranking charts¶

Correlation: Spearman's rank & Kendall's rank¶

Data Visualization: Parallel Coordinates¶

Stacked Bar Charts¶

Part 2: Dimensionality and Feature Reduction¶

Log Transformation for highly skewed data¶

Converting categorical data into numbers (Country, Type)¶

Random Forest Classifier¶

Part 3 - Model Evaluation & Selection¶

Training - Splitting data into training and testing¶

Evaluation¶

i. Confusion Matrix¶

ii. Precision, Recall & F1 score¶

ROC curve¶

	horse_no	horse_id	result	lengths_behind	horse_age	horse_country	horse_type	horse_rating	...	time2	time3	time4	time5	time6	finish_time	win_odds	place_odds	trainer_id	jockey_id
0	1	3917	10	8.00	3	AUS	Gelding	60	...	21.59	23.86	24.62	NaN	NaN	83.92	9.7	3.7	118	2
1	2	2157	8	5.75	3	NZ	Gelding	60	...	21.99	23.30	23.70	NaN	NaN	83.56	16.0	4.9	164	57
2	3	858	7	4.75	3	NZ	Gelding	60	...	21.59	23.90	24.22	NaN	NaN	83.40	3.5	1.5	137	18
3	4	1853	9	6.25	3	SAF	Gelding	60	...	21.83	23.70	24.00	NaN	NaN	83.62	39.0	11.0	80	59
4	5	2796	6	3.75	3	GB	Gelding	60	...	21.75	23.22	23.50	NaN	NaN	83.24	50.0	14.0	9	154

	result	lengths_behind	horse_age	horse_country	horse_type	horse_rating	horse_gear	declared_weight	actual_weight	...	position_sec3	behind_sec1	behind_sec2	behind_sec3	time1	time2	time3	finish_time	win_odds	win_odds_log1p
0	10	8.00	3	AUS	Gelding	60	--	1020.0	133	...	6	2.00	2.00	1.50	13.85	21.59	23.86	83.92	9.7	2.370244
1	8	5.75	3	NZ	Gelding	60	--	980.0	133	...	13	6.50	9.00	5.00	14.57	21.99	23.30	83.56	16.0	2.833213
2	7	4.75	3	NZ	Gelding	60	--	1082.0	132	...	2	1.00	1.00	0.75	13.69	21.59	23.90	83.40	3.5	1.504077
3	9	6.25	3	SAF	Gelding	60	--	1118.0	127	...	11	3.50	5.00	3.50	14.09	21.83	23.70	83.62	39.0	3.688879
4	6	3.75	3	GB	Gelding	60	--	972.0	131	...	12	7.75	8.75	4.25	14.77	21.75	23.22	83.24	50.0	3.931826