import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config Completer.use_jedi = False


df = pd.read_csv('./data/CIA_Country_Facts.csv')
df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Country                             227 non-null    object 
 1   Region                              227 non-null    object 
 2   Population                          227 non-null    int64  
 3   Area (sq. mi.)                      227 non-null    int64  
 4   Pop. Density (per sq. mi.)          227 non-null    float64
 5   Coastline (coast/area ratio)        227 non-null    float64
 6   Net migration                       224 non-null    float64
 7   Infant mortality (per 1000 births)  224 non-null    float64
 8   GDP ($ per capita)                  226 non-null    float64
 9   Literacy (%)                        209 non-null    float64
 10  Phones (per 1000)                   223 non-null    float64
 11  Arable (%)                          225 non-null    float64
 12  Crops (%)                           225 non-null    float64
 13  Other (%)                           225 non-null    float64
 14  Climate                             205 non-null    float64
 15  Birthrate                           224 non-null    float64
 16  Deathrate                           223 non-null    float64
 17  Agriculture                         212 non-null    float64
 18  Industry                            211 non-null    float64
 19  Service                             212 non-null    float64
dtypes: float64(16), int64(2), object(2)
memory usage: 35.6+ KB


# check for missing values
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                          18
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                               22
Birthrate                              3
Deathrate                              4
Agriculture                           15
Industry                              16
Service                               15
dtype: int64


# check our continuous (numerical) features
df.describe().T


# we could have some outliers in the data
plt.figure(figsize=(10,5), dpi=200)
sns.boxplot(data=df, x='Population', y='Region')

plt.show()


# if we reduce the range of the box plot above, we will lose India and China as they are extreme  outliers for Asia
# (see new box plot below)
df.sort_values(by='Population', ascending=False)[['Country', 'Population']]


# we could have some outliers in the data
plt.figure(figsize=(10,5), dpi=200)
sns.boxplot(data=df, x='Population', y='Region')
plt.xlim(0, 0.4e9)

plt.show()


sns.histplot(data=df, x='Population');


sns.histplot(data=df[df['Population'] < 0.4e9], x='Population');


plt.figure(figsize=(8,5), dpi=100)
sns.barplot(data=df, x='Region', y='GDP ($ per capita)')
plt.xticks(rotation=90)
plt.title('GDP in \\$USD per Capita by Region')

plt.show()


def fig(x=8, y=4, d=100):
    return plt.figure(figsize=(x,y), dpi=d)


fig(y=6)
sns.scatterplot(data=df,
                x='GDP ($ per capita)',
                y='Phones (per 1000)',
                hue='Region',
                style='Region')
# plt.xscale("log")
plt.legend(loc=(1.05, 0.25))

plt.show()


fig(y=6, d=200)
sns.scatterplot(data=df,
                x='GDP ($ per capita)',
                y='Literacy (%)',
                hue='Region',
                style='Region')

# plt.xscale("log")
# plt.yscale("log")
plt.legend(loc=(1.05, 0.25))

plt.show()


sns.heatmap(data=df.corr(), cmap='rainbow');


sns.clustermap(data=df.corr());


# missing values by column
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                          18
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                               22
Birthrate                              3
Deathrate                              4
Agriculture                           15
Industry                              16
Service                               15
dtype: int64


df[df['Agriculture'].isnull()]['Country']

3            American Samoa
4                   Andorra
78                Gibraltar
80                Greenland
83                     Guam
134                 Mayotte
140              Montserrat
144                   Nauru
153      N. Mariana Islands
171            Saint Helena
174    St Pierre & Miquelon
177              San Marino
208       Turks & Caicos Is
221       Wallis and Futuna
223          Western Sahara
Name: Country, dtype: object


# fill missing values in the agriculture column with zero
df['Agriculture'].fillna(0, inplace=True)
df['Agriculture'].isnull().sum()

0


# check for other missing values
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                          18
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                               22
Birthrate                              3
Deathrate                              4
Agriculture                            0
Industry                              16
Service                               15
dtype: int64


# mean Climate for each Region
df.groupby('Region').mean()['Climate']

Region
ASIA (EX. NEAR EAST)                   1.962963
BALTICS                                3.000000
C.W. OF IND. STATES                    2.550000
EASTERN EUROPE                         3.111111
LATIN AMER. & CARIB                    2.033333
NEAR EAST                              1.666667
NORTHERN AFRICA                        1.500000
NORTHERN AMERICA                       2.000000
OCEANIA                                2.000000
SUB-SAHARAN AFRICA                     1.885417
WESTERN EUROPE                         3.095238
Name: Climate, dtype: float64


# fill missing climate values with the average for that country's region
df['Climate'] = df.groupby('Region').transform(lambda x: x.fillna(x.mean()))


df['Literacy (%)'] = df.groupby('Region').transform(lambda x: x.fillna(x.mean()))
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                           0
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                                0
Birthrate                              3
Deathrate                              4
Agriculture                            0
Industry                              16
Service                               15
dtype: int64


# which countries are missing net migration data?
df[df['Net migration'].isnull()]['Country']

47          Cook Islands
221    Wallis and Futuna
223       Western Sahara
Name: Country, dtype: object


# we found the updated values for two of the countries online (from cia.gov):
df.loc[47, 'Net migration'] = -28.58
df.loc[221, 'Net migration'] = -4.16

# for the remaining country, we can use the average for that region:
df['Net migration'] = df.groupby('Region').transform(lambda x: x.fillna(x.mean()))

# check for missing values
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          0
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                           0
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                                0
Birthrate                              3
Deathrate                              4
Agriculture                            0
Industry                              16
Service                               15
dtype: int64


df[df['Infant mortality (per 1000 births)'].isnull()]['Country']

47          Cook Islands
221    Wallis and Futuna
223       Western Sahara
Name: Country, dtype: object


# we found the updated values for two of the countries online (cia.gov):
df.loc[47, 'Infant mortality (per 1000 births)'] = 16.33
df.loc[221, 'Infant mortality (per 1000 births)'] = 4.06

# for the remaining country, we can use the average for that region:
df['Infant mortality (per 1000 births)'] = df.groupby('Region').transform(lambda x: x.fillna(x.mean()))

# check for missing values
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          0
Infant mortality (per 1000 births)     0
GDP ($ per capita)                     1
Literacy (%)                           0
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                                0
Birthrate                              3
Deathrate                              4
Agriculture                            0
Industry                              16
Service                               15
dtype: int64


df[df['GDP ($ per capita)'].isnull()]['Country']

223    Western Sahara
Name: Country, dtype: object


# since Western Sahara has had so many missing values and we were not able to find any updated values, 
# we have decided to drop this country from the data set
df = df.drop([223], axis=0)


df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          0
Infant mortality (per 1000 births)     0
GDP ($ per capita)                     0
Literacy (%)                           0
Phones (per 1000)                      3
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                                0
Birthrate                              2
Deathrate                              3
Agriculture                            0
Industry                              15
Service                               15
dtype: int64


df[df['Birthrate'].isnull()]['Country']

181               Serbia
221    Wallis and Futuna
Name: Country, dtype: object


# we found the updated values for the two countries online (cia.gov):
df.loc[181, 'Birthrate'] = 8.74
df.loc[221, 'Birthrate'] = 12.49


df[df['Deathrate'].isnull()]['Country']

47          Cook Islands
181               Serbia
221    Wallis and Futuna
Name: Country, dtype: object


# we found the updated values for the three countries online (cia.gov):
df.loc[47, 'Deathrate'] = 8.89
df.loc[181, 'Deathrate'] = 13.49
df.loc[221, 'Deathrate'] = 5.74


# how many missing values do we have left?
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          0
Infant mortality (per 1000 births)     0
GDP ($ per capita)                     0
Literacy (%)                           0
Phones (per 1000)                      3
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                                0
Birthrate                              0
Deathrate                              0
Agriculture                            0
Industry                              15
Service                               15
dtype: int64


df[df['Arable (%)'].isnull()]['Country']

85     Guernsey
134     Mayotte
Name: Country, dtype: object


df[df['Crops (%)'].isnull()]['Country']

85     Guernsey
134     Mayotte
Name: Country, dtype: object


df[df['Other (%)'].isnull()]['Country']

85     Guernsey
134     Mayotte
Name: Country, dtype: object


# we are going to drop these two countries as well
df = df.drop([85, 134], axis=0)
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          0
Infant mortality (per 1000 births)     0
GDP ($ per capita)                     0
Literacy (%)                           0
Phones (per 1000)                      3
Arable (%)                             0
Crops (%)                              0
Other (%)                              0
Climate                                0
Birthrate                              0
Deathrate                              0
Agriculture                            0
Industry                              14
Service                               14
dtype: int64


df[df['Phones (per 1000)'].isnull()]['Country']

52         Cyprus
58     East Timor
140    Montserrat
Name: Country, dtype: object


# since we can find this data online, we will go ahead and fill these missing values with the mean from the corresponding region
df['Phones (per 1000)'] = df.groupby('Region').transform(lambda x: x.fillna(x.mean()))
df.isnull().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          0
Infant mortality (per 1000 births)     0
GDP ($ per capita)                     0
Literacy (%)                           0
Phones (per 1000)                      0
Arable (%)                             0
Crops (%)                              0
Other (%)                              0
Climate                                0
Birthrate                              0
Deathrate                              0
Agriculture                            0
Industry                              14
Service                               14
dtype: int64


df[['Industry', 'Service']].describe()


df.groupby('Region').mean()[['Industry', 'Service']]


fig()
sns.scatterplot(data=df, x='Industry', y='Service', hue='Region')
plt.legend(loc=(1.05, 0.25))
plt.show()


# fill the missing values for service and industry with the region's average
df['Industry'] = df.groupby('Region').transform(lambda x: x.fillna(x.mean()))
df['Service'] = df.groupby('Region').transform(lambda x: x.fillna(x.mean()))


# make sure there are no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224 entries, 0 to 226
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Country                             224 non-null    object 
 1   Region                              224 non-null    object 
 2   Population                          224 non-null    int64  
 3   Area (sq. mi.)                      224 non-null    int64  
 4   Pop. Density (per sq. mi.)          224 non-null    float64
 5   Coastline (coast/area ratio)        224 non-null    float64
 6   Net migration                       224 non-null    float64
 7   Infant mortality (per 1000 births)  224 non-null    float64
 8   GDP ($ per capita)                  224 non-null    float64
 9   Literacy (%)                        224 non-null    float64
 10  Phones (per 1000)                   224 non-null    float64
 11  Arable (%)                          224 non-null    float64
 12  Crops (%)                           224 non-null    float64
 13  Other (%)                           224 non-null    float64
 14  Climate                             224 non-null    float64
 15  Birthrate                           224 non-null    float64
 16  Deathrate                           224 non-null    float64
 17  Agriculture                         224 non-null    float64
 18  Industry                            224 non-null    float64
 19  Service                             224 non-null    float64
dtypes: float64(16), int64(2), object(2)
memory usage: 36.8+ KB


# CHECKPOINT
df_clean = df.copy()


# drop the country column
df_countries = df_clean.copy()
df_clean = df_clean.drop('Country', axis=1)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224 entries, 0 to 226
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Region                              224 non-null    object 
 1   Population                          224 non-null    int64  
 2   Area (sq. mi.)                      224 non-null    int64  
 3   Pop. Density (per sq. mi.)          224 non-null    float64
 4   Coastline (coast/area ratio)        224 non-null    float64
 5   Net migration                       224 non-null    float64
 6   Infant mortality (per 1000 births)  224 non-null    float64
 7   GDP ($ per capita)                  224 non-null    float64
 8   Literacy (%)                        224 non-null    float64
 9   Phones (per 1000)                   224 non-null    float64
 10  Arable (%)                          224 non-null    float64
 11  Crops (%)                           224 non-null    float64
 12  Other (%)                           224 non-null    float64
 13  Climate                             224 non-null    float64
 14  Birthrate                           224 non-null    float64
 15  Deathrate                           224 non-null    float64
 16  Agriculture                         224 non-null    float64
 17  Industry                            224 non-null    float64
 18  Service                             224 non-null    float64
dtypes: float64(16), int64(2), object(1)
memory usage: 35.0+ KB


len(df_clean['Region'].unique())

11


# this will transform the single region column in 11 new dummy variables (one for each unique region)
X = pd.get_dummies(df_clean)
X


X.columns

Index(['Population', 'Area (sq. mi.)', 'Pop. Density (per sq. mi.)',
       'Coastline (coast/area ratio)', 'Net migration',
       'Infant mortality (per 1000 births)', 'GDP ($ per capita)',
       'Literacy (%)', 'Phones (per 1000)', 'Arable (%)', 'Crops (%)',
       'Other (%)', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture',
       'Industry', 'Service', 'Region_ASIA (EX. NEAR EAST)         ',
       'Region_BALTICS                            ',
       'Region_C.W. OF IND. STATES ',
       'Region_EASTERN EUROPE                     ',
       'Region_LATIN AMER. & CARIB    ',
       'Region_NEAR EAST                          ',
       'Region_NORTHERN AFRICA                    ',
       'Region_NORTHERN AMERICA                   ',
       'Region_OCEANIA                            ',
       'Region_SUB-SAHARAN AFRICA                 ',
       'Region_WESTERN EUROPE                     '],
      dtype='object')


from sklearn.preprocessing import StandardScaler


# define, fit, and transform using the scaler here:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


from sklearn.cluster import KMeans


ssd = []
kmax = 31

# loop over a variety of K values
for K in range(2, kmax):
    
    # build Kmeans model using current value of K
    model = KMeans(n_clusters=K)
    
    # fit the model and assign clusters using the scaled features
    labels = model.fit_predict(X_scaled)
    
    # store the labels with our feature DataFrame
    kstring = "Cluster K" + str(K)
    df_countries[kstring] = labels
    
    # store the sum of squared distances
    ssd.append(model.inertia_)


fig()
plt.plot(range(2, kmax), ssd, 'ko--');
plt.title('Elbow Plot for K-Means Clustering')
plt.xlabel('K Value')
plt.ylabel('Sum of Squared Distance')
plt.xticks(np.arange(2, kmax, 2))
plt.show()


# determine a good cut-off point for the clusters
pd.Series(ssd).diff()

0            NaN
1    -604.203464
2    -381.829297
3    -304.034796
4    -282.022613
5    -249.853234
6    -288.553950
7    -272.309309
8    -252.930646
9    -233.585127
10   -251.169929
11   -184.845701
12   -128.841894
13    -93.620413
14    -70.499219
15    -80.475810
16    -76.101790
17    -27.868649
18    -48.570732
19    -73.111144
20    -54.090490
21    -42.739412
22    -18.619219
23    -35.603356
24    -37.244304
25    -36.248956
26    -35.930069
27    -12.819551
28    -26.679864
dtype: float64


ssd_col = pd.Series(ssd)
diff_col = pd.Series(ssd).diff()
k_values = np.arange(2, kmax)
ssd_data = pd.concat([ssd_col, diff_col], axis=1)

# pd.DataFrame(ssd_data)

ssd_data.columns = ['SSD', 'DIFF']
ssd_data['K-Values'] = k_values
ssd_data = ssd_data[['K-Values', 'SSD', 'DIFF']]
ssd_data


diff_vals = ssd_data['DIFF'][1:]
k_vals = ssd_data['K-Values'][1:]

fig()
sns.barplot(x=k_vals, y=diff_vals);
plt.xlabel('K Value')

plt.show()


# compute the percent difference for increasing from K to K+1 clusters
ssd_data['Percent_Diff'] = round(100.0 * ssd_data['DIFF'] / ssd_data['SSD'], 1)
ssd_data


diff_perc = ssd_data['Percent_Diff'][1:]
temp = diff_perc.sort_values()
temp

10   -14.5
1    -14.2
11   -11.9
9    -11.8
8    -11.4
7    -11.0
6    -10.5
2     -9.9
12    -9.1
4     -8.6
3     -8.5
5     -8.2
19    -7.7
13    -7.0
16    -6.9
15    -6.8
20    -6.0
14    -5.6
26    -5.2
21    -5.0
25    -5.0
24    -4.9
18    -4.7
23    -4.4
28    -4.1
17    -2.6
22    -2.2
27    -1.9
Name: Percent_Diff, dtype: float64


fig()
sns.barplot(x=k_vals, y=diff_perc);


# KMeans model with 3 clusters
K = 3
model = KMeans(n_clusters=3)

# determine labels for each country
labels = model.fit_predict(X_scaled)
labels

array([0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 2, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0])


# check correlation with clusters
X_temp = X.copy()
X_temp['Cluster K3'] = df_countries['Cluster K3']
X_temp.corr()['Cluster K3'].iloc[:-1].sort_values()

Region_SUB-SAHARAN AFRICA                    -0.888338
Birthrate                                    -0.733860
Deathrate                                    -0.701050
Agriculture                                  -0.554420
Other (%)                                    -0.153648
Region_BALTICS                                0.057045
Crops (%)                                     0.057991
Region_NORTHERN AMERICA                       0.073980
Region_NORTHERN AFRICA                        0.073980
Area (sq. mi.)                                0.093060
Pop. Density (per sq. mi.)                    0.094124
Region_C.W. OF IND. STATES                    0.116486
Region_EASTERN EUROPE                         0.116486
Coastline (coast/area ratio)                  0.131157
Region_NEAR EAST                              0.135793
Arable (%)                                    0.152822
Region_ASIA (EX. NEAR EAST)                   0.153556
Region_OCEANIA                                0.157475
Region_WESTERN EUROPE                         0.181259
Region_LATIN AMER. & CARIB                    0.245488
Industry                                      0.263250
Climate                                       0.263250
Phones (per 1000)                             0.263250
Literacy (%)                                  0.263250
Infant mortality (per 1000 births)            0.263250
Net migration                                 0.263250
Service                                       0.263250
Population                                    0.263250
GDP ($ per capita)                            0.384592
Name: Cluster K3, dtype: float64


fig()
sns.heatmap(X_temp.corr());


X_temp.corr()['Cluster K3'][:-3].sort_values().plot(kind='bar');


# check correlation with 6 clusters
X_temp['Cluster K6'] = df_countries['Cluster K6']
X_temp.corr()['Cluster K6'].iloc[:-1].sort_values()

Region_WESTERN EUROPE                        -0.535156
Other (%)                                    -0.330677
Region_SUB-SAHARAN AFRICA                    -0.284995
GDP ($ per capita)                           -0.238292
Birthrate                                    -0.232989
Pop. Density (per sq. mi.)                   -0.184307
Deathrate                                    -0.153014
Agriculture                                  -0.107966
Coastline (coast/area ratio)                 -0.036722
Region_ASIA (EX. NEAR EAST)                   0.017282
Region_NORTHERN AFRICA                        0.019663
Region_NEAR EAST                              0.036093
Crops (%)                                     0.046516
Region_OCEANIA                                0.053922
Region_LATIN AMER. & CARIB                    0.074027
Region_NORTHERN AMERICA                       0.138705
Area (sq. mi.)                                0.150236
Population                                    0.170406
Industry                                      0.170406
Climate                                       0.170406
Phones (per 1000)                             0.170406
Literacy (%)                                  0.170406
Infant mortality (per 1000 births)            0.170406
Net migration                                 0.170406
Service                                       0.170406
Region_C.W. OF IND. STATES                    0.187160
Region_BALTICS                                0.290536
Cluster K3                                    0.354847
Arable (%)                                    0.379257
Region_EASTERN EUROPE                         0.593278
Name: Cluster K6, dtype: float64


# check correlation with 14 clusters
X_temp['Cluster K14'] = df_countries['Cluster K14']
X_temp.corr()['Cluster K14'].iloc[:-1].sort_values()

Region_ASIA (EX. NEAR EAST)                  -0.555462
Birthrate                                    -0.219371
Region_WESTERN EUROPE                        -0.218064
Region_SUB-SAHARAN AFRICA                    -0.212522
Agriculture                                  -0.187314
Crops (%)                                    -0.185334
Region_BALTICS                               -0.153970
Deathrate                                    -0.142578
Service                                      -0.127814
Industry                                     -0.127814
Climate                                      -0.127814
Population                                   -0.127814
Phones (per 1000)                            -0.127814
Literacy (%)                                 -0.127814
Infant mortality (per 1000 births)           -0.127814
Net migration                                -0.127814
Arable (%)                                   -0.062396
Region_NEAR EAST                             -0.012639
Coastline (coast/area ratio)                 -0.000647
GDP ($ per capita)                            0.030943
Region_NORTHERN AFRICA                        0.041313
Pop. Density (per sq. mi.)                    0.083678
Other (%)                                     0.146658
Area (sq. mi.)                                0.162968
Region_OCEANIA                                0.200308
Region_EASTERN EUROPE                         0.216833
Cluster K3                                    0.221688
Region_LATIN AMER. & CARIB                    0.297028
Region_NORTHERN AMERICA                       0.330505
Cluster K6                                    0.340262
Region_C.W. OF IND. STATES                    0.368617
Name: Cluster K14, dtype: float64


# we stored our results in the feature data with new columns for each set of labels 
# produced by our different K-Means models: K={3, 6, 14}
df_countries[['Cluster K3', 'Cluster K6', 'Cluster K14']]


# install these modules if you do not have them already
# pip install "notebook>=5.3" "ipywidgets>=7.5"


# install plotly library
# pip install plotly==4.14.3


# load the country ISO codes
df_iso = pd.read_csv('./data/country_iso_codes.csv')
df_iso


# convert the ISO codes to a dictionary 
iso_map = df_iso.set_index('Country')['ISO Code'].to_dict()


# create a copy of our DataFrame that still contains the "Country" column
df_map = df_countries.copy()


# create a new column for the ISO codes
df_map['ISO CODE'] = df_countries['Country'].map(iso_map)
df_map


# import the choropleth plot from plotly
import plotly.express as px


# create map with our 3 clusters
f = px.choropleth(df_map, 
                    locations="ISO CODE",
                    color="Cluster K3",
                    hover_name="Country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
f.show()


# create map with our 6 clusters
f = px.choropleth(df_map, 
                    locations="ISO CODE",
                    color="Cluster K6",
                    hover_name="Country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
f.show()


# create map with our 14 clusters
f = px.choropleth(df_map, 
                    locations="ISO CODE",
                    color="Cluster K14",
                    hover_name="Country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
f.show()

	Country	Region	Population	Area (sq. mi.)	Pop. Density (per sq. mi.)	Coastline (coast/area ratio)	Net migration	Infant mortality (per 1000 births)	GDP ($ per capita)	Literacy (%)	Phones (per 1000)	Arable (%)	Crops (%)	Other (%)	Climate	Birthrate	Deathrate	Agriculture	Industry	Service
0	Afghanistan	ASIA (EX. NEAR EAST)	31056997	647500	48.0	0.00	23.06	163.07	700.0	36.0	3.2	12.13	0.22	87.65	1.0	46.60	20.34	0.380	0.240	0.380
1	Albania	EASTERN EUROPE	3581655	28748	124.6	1.26	-4.93	21.52	4500.0	86.5	71.2	21.09	4.42	74.49	3.0	15.11	5.22	0.232	0.188	0.579
2	Algeria	NORTHERN AFRICA	32930091	2381740	13.8	0.04	-0.39	31.00	6000.0	70.0	78.1	3.22	0.25	96.53	1.0	17.14	4.61	0.101	0.600	0.298
3	American Samoa	OCEANIA	57794	199	290.4	58.29	-20.71	9.27	8000.0	97.0	259.5	10.00	15.00	75.00	2.0	22.46	3.27	NaN	NaN	NaN
4	Andorra	WESTERN EUROPE	71201	468	152.1	0.00	6.60	4.05	19000.0	100.0	497.2	2.22	0.00	97.78	3.0	8.71	6.25	NaN	NaN	NaN

	count	mean	std	min	25%	50%	75%	max
Population	227.0	2.874028e+07	1.178913e+08	7026.000	437624.00000	4786994.000	1.749777e+07	1.313974e+09
Area (sq. mi.)	227.0	5.982270e+05	1.790282e+06	2.000	4647.50000	86600.000	4.418110e+05	1.707520e+07
Pop. Density (per sq. mi.)	227.0	3.790471e+02	1.660186e+03	0.000	29.15000	78.800	1.901500e+02	1.627150e+04
Coastline (coast/area ratio)	227.0	2.116533e+01	7.228686e+01	0.000	0.10000	0.730	1.034500e+01	8.706600e+02
Net migration	224.0	3.812500e-02	4.889269e+00	-20.990	-0.92750	0.000	9.975000e-01	2.306000e+01
Infant mortality (per 1000 births)	224.0	3.550696e+01	3.538990e+01	2.290	8.15000	21.000	5.570500e+01	1.911900e+02
GDP ($ per capita)	226.0	9.689823e+03	1.004914e+04	500.000	1900.00000	5550.000	1.570000e+04	5.510000e+04
Literacy (%)	209.0	8.283828e+01	1.972217e+01	17.600	70.60000	92.500	9.800000e+01	1.000000e+02
Phones (per 1000)	223.0	2.360614e+02	2.279918e+02	0.200	37.80000	176.200	3.896500e+02	1.035600e+03
Arable (%)	225.0	1.379711e+01	1.304040e+01	0.000	3.22000	10.420	2.000000e+01	6.211000e+01
Crops (%)	225.0	4.564222e+00	8.361470e+00	0.000	0.19000	1.030	4.440000e+00	5.068000e+01
Other (%)	225.0	8.163831e+01	1.614083e+01	33.330	71.65000	85.700	9.544000e+01	1.000000e+02
Climate	205.0	2.139024e+00	6.993968e-01	1.000	2.00000	2.000	3.000000e+00	4.000000e+00
Birthrate	224.0	2.211473e+01	1.117672e+01	7.290	12.67250	18.790	2.982000e+01	5.073000e+01
Deathrate	223.0	9.241345e+00	4.990026e+00	2.290	5.91000	7.840	1.060500e+01	2.974000e+01
Agriculture	212.0	1.508443e-01	1.467980e-01	0.000	0.03775	0.099	2.210000e-01	7.690000e-01
Industry	211.0	2.827109e-01	1.382722e-01	0.020	0.19300	0.272	3.410000e-01	9.060000e-01
Service	212.0	5.652830e-01	1.658410e-01	0.062	0.42925	0.571	6.785000e-01	9.540000e-01

	Country	Population
42	China	1313973713
94	India	1095351995
214	United States	298444215
95	Indonesia	245452739
27	Brazil	188078227
...	...	...
144	Nauru	13287
209	Tuvalu	11810
140	Montserrat	9439
171	Saint Helena	7502
174	St Pierre & Miquelon	7026

	Industry	Service
count	210.000000	210.000000
mean	0.283581	0.564619
std	0.138022	0.164897
min	0.020000	0.062000
25%	0.196500	0.431500
50%	0.272500	0.571000
75%	0.341500	0.677500
max	0.906000	0.954000

	Industry	Service
Region
ASIA (EX. NEAR EAST)	0.302143	0.520107
BALTICS	0.293333	0.661667
C.W. OF IND. STATES	0.328000	0.480167
EASTERN EUROPE	0.309250	0.598667
LATIN AMER. & CARIB	0.256116	0.650721
NEAR EAST	0.406000	0.530000
NORTHERN AFRICA	0.426200	0.438400
NORTHERN AMERICA	0.199333	0.787000
OCEANIA	0.215250	0.608937
SUB-SAHARAN AFRICA	0.266878	0.449755
WESTERN EUROPE	0.252435	0.707870

CIA Country Analysis and Clustering¶

Country and Region Data¶

Part 1: Exploratory Data Analysis¶

Part 2: Data Preparation and Model Discovery¶

Part 3: Feature Preparation¶

Feature Scaling¶

Part 4: K-Means Clustering¶

Part 5: Model Interpretation¶

Option 1: Choosing K=3¶

Option 2: Choosing K=6¶

Option 3: Choosing K=14¶

Part 6: Geographical Model Interpretation¶

	K-Values	SSD	DIFF
0	2	4856.751461	NaN
1	3	4252.547997	-604.203464
2	4	3870.718700	-381.829297
3	5	3566.683904	-304.034796
4	6	3284.661292	-282.022613
5	7	3034.808057	-249.853234
6	8	2746.254107	-288.553950
7	9	2473.944798	-272.309309
8	10	2221.014152	-252.930646
9	11	1987.429025	-233.585127
10	12	1736.259096	-251.169929
11	13	1551.413395	-184.845701
12	14	1422.571500	-128.841894
13	15	1328.951087	-93.620413
14	16	1258.451868	-70.499219
15	17	1177.976058	-80.475810
16	18	1101.874268	-76.101790
17	19	1074.005619	-27.868649
18	20	1025.434888	-48.570732
19	21	952.323744	-73.111144
20	22	898.233254	-54.090490
21	23	855.493842	-42.739412
22	24	836.874623	-18.619219
23	25	801.271267	-35.603356
24	26	764.026963	-37.244304
25	27	727.778007	-36.248956
26	28	691.847938	-35.930069
27	29	679.028386	-12.819551
28	30	652.348522	-26.679864

	Cluster K3	Cluster K6	Cluster K14
0	0	1	1
1	1	5	9
2	1	2	7
3	1	2	10
4	1	0	4
...	...	...	...
221	1	2	10
222	1	2	6
224	1	2	6
225	0	1	5
226	0	1	5

	Country	ISO Code
0	Afghanistan	AFG
1	Akrotiri and Dhekelia – See United Kingdom, The	Akrotiri and Dhekelia – See United Kingdom, The
2	Åland Islands	ALA
3	Albania	ALB
4	Algeria	DZA
...	...	...
296	Congo, Dem. Rep.	COD
297	Congo, Repub. of the	COG
298	Tanzania	TZA
299	Central African Rep.	CAF
300	Cote d'Ivoire	CIV

	Cluster K3	Cluster K6	Cluster K14
0	0	1	1
1	1	5	9
2	1	2	7
3	1	2	10
4	1	0	4
...	...	...	...
221	1	2	10
222	1	2	6
224	1	2	6
225	0	1	5
226	0	1	5

	Cluster K3	Cluster K6	Cluster K14
0	0	1	1
1	1	5	9
2	1	2	7
3	1	2	10
4	1	0	4
...	...	...	...
221	1	2	10
222	1	2	6
224	1	2	6
225	0	1	5
226	0	1	5