# Precision and Recall ¶

Precision and recall evaluation, using the "Adult Data Set."

This dataset is used for predicting whether or not a person's annual income is greater than \$50,000.
We will construct a neural network model to demonstrate what can be done with this data and machine learning.

And, the Confusion Matrix and Classification Report convey a lot of information about evaluation.

• Precision is the fraction of records predicted as a class that truly belong to that class.
• Recall, on the other hand, is the fraction of a class's true records that the model recovers.
• The f1-score is the harmonic mean of precision and recall, balancing the two.

The reference for the dataset is below.

Adult Data Set, Ronny Kohavi and Barry Becker, Data Mining and Visualization, Silicon Graphics.

## Required Libraries ¶

• matplotlib 2.0.2
• numpy 1.12.1
• scikit-learn 0.18.2
• pandas 0.20.3
In [1]:

from __future__ import division, print_function
import numpy as np
import pandas as pd

import renom as rm
from renom.cuda import set_cuda_active

from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
# If you would like to use GPU computing, set this to True. Otherwise, False.
set_cuda_active(False)


## Import column info ¶

Import the column names, and return a dictionary describing which columns are numeric and which are not.

In [2]:

def make_col_names():
    """Parse ``adult.names`` and build the column list plus a continuity map.

    Returns:
        col_names (list[str]): dataset column names in file order, with
            "label" appended for the target column.
        continuous_dict (dict[str, bool]): maps each column name to True if
            the column is numeric ("continuous" in the .names file).
    """
    col_names = []
    continuous_dict = {}
    # NOTE(review): the original cell lost its file-reading loop ("i" and
    # "line" were used undefined). Reconstructed from the dangling
    # `if i > 96` guard: the column descriptions in the UCI "adult.names"
    # file start after its ~96-line preamble -- confirm against the file.
    with open("adult.names") as f:
        for i, line in enumerate(f):
            if i > 96:
                line = line.rstrip()
                # Lines look like "age: continuous." or "sex: Female, Male."
                name = line.split(":")[0]
                col_names.append(name)
                line = line.replace(" ", "").replace(".", "")
                continuous = line.split(":")[1] == "continuous"
                continuous_dict[name] = continuous
    # The target column is not listed in the descriptions; add it manually.
    col_names.append("label")
    continuous_dict["label"] = False
    return col_names, continuous_dict


## Get the column names and determine which columns are continuous ¶

In [3]:

# Running record-ID counter, threaded through the train/test loads below so
# every row across both splits gets a unique ID.
n_id = 0
col_names, continuous_dicts = make_col_names()


## Load the data and make an Index ¶

Now we'll demonstrate dealing with missing values, and other pre-processing steps.

In [4]:

def load_data(filename, col_names, n, skiprows=None):
    """Load one CSV split of the Adult dataset and clean it.

    Drops rows with missing values (written as " ?" in the raw files),
    unifies the two label spellings, and assigns each row a unique ID
    starting at ``n``.

    Args:
        filename (str): path of the split to read ("adult.data"/"adult.test").
        col_names (list[str]): column names from :func:`make_col_names`.
        n (int): first record ID to assign.
        skiprows (int | None): rows to skip at the top of the file
            (adult.test starts with a junk header line).

    Returns:
        tuple[pandas.DataFrame, int]: the cleaned frame and the next free ID.
    """
    # NOTE(review): the original cell lost the bodies of this if/else; the
    # read_csv calls below are a reconstruction -- confirm against the
    # original notebook.
    if skiprows:
        df = pd.read_csv(filename, header=None, skiprows=skiprows)
    else:
        df = pd.read_csv(filename, header=None)

    # Display the number of records before deleting missing values.
    print("the number of {} records:{}\n".format(filename, len(df.index)))
    df.columns = col_names

    # Replace the missing value's character with np.nan.
    df = df.applymap(lambda d: np.nan if d == " ?" else d)

    # Unify the different written forms (adult.test labels end with a dot).
    df = df.applymap(lambda d: " <=50K" if d == " <=50K." else d)
    df = df.applymap(lambda d: " >50K" if d == " >50K." else d)

    # Display information about missing values, then drop incomplete rows.
    print("missing value info:\n{}\n".format(df.isnull().sum(axis=0)))
    df = df.dropna(axis=0)

    # The number of records after deleting missing values.
    print("the number of {} records after trimming:{}\n".format(filename, len(df.index)))
    # Assign sequential unique IDs so the splits can be separated again
    # after being concatenated in convert_data.
    ids = list(np.arange(n, n + len(df.index)))
    df["ID"] = np.array(ids)
    n = n + len(df.index)
    return df, n


## Load the data and carry out some pre-processing on the pandas data frame ¶

`adult.data` is used for training, and `adult.test` is used for prediction.

In [5]:

df_train,n_id_train = load_data("adult.data", col_names, n_id)

the number of adult.data records:32561

missing value info:
age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
label                0
dtype: int64

the number of adult.data records after trimming:30162

missing value info:
age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
label               0
dtype: int64

the number of adult.test records after trimming:15060



## Get the non-continuous columns ¶

In [6]:

def get_not_continuous_columns(continuous_dict):
    """Return the names of all non-continuous (categorical) columns."""
    categorical_names = []
    for column, is_continuous in continuous_dict.items():
        if not is_continuous:
            categorical_names.append(column)
    return categorical_names


## Display label information ¶

In [7]:

def print_labelinfo(labelnames):
    """Print each label's index alongside its name, one per line.

    Args:
        labelnames (list): label names in index order.
    """
    # enumerate instead of range(len(...)) -- same output, idiomatic loop.
    for i, name in enumerate(labelnames):
        print("label{}:{}".format(i, name))


## Convert the data from pandas data frame to numpy data array ¶

We convert the data from a categorical format to "one-hot" vector representation.

In [8]:

def convert_data(df_train, df_test, n_id_train, n_id_test, continuous_dicts):
    """One-hot encode categorical columns, min-max scale, and split back.

    The two frames are concatenated so that dummy columns are consistent
    across splits, then separated again using the ID column.

    Args:
        df_train, df_test (pandas.DataFrame): cleaned splits from load_data.
        n_id_train (int): first ID belonging to the test split.
        n_id_test (int): unused here; kept for interface compatibility.
        continuous_dicts (dict): column -> is-continuous flags.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays.
    """
    categorical_names = get_not_continuous_columns(continuous_dicts)
    merged = pd.concat((df_train, df_test), axis=0)

    # Replace each categorical column with its dummy (one-hot) columns.
    for column in categorical_names:
        if column == "label":
            # Remember which dummy columns hold the target.
            labelnames = list(pd.get_dummies(merged[column]).columns)
            print("labelname:{}".format(labelnames))
        dummies = pd.get_dummies(merged[column])
        merged = pd.concat((merged, dummies), axis=1).drop(column, axis=1)

    # Everything is numeric from here on.
    for column in merged.columns:
        merged[column] = merged[column].astype(float)

    # Min-max scale every feature; the ID column is left untouched.
    for column in merged.columns:
        if column != "ID":
            lo = merged[column].min()
            merged[column] = (merged[column] - lo) / (merged[column].max() - lo)

    # IDs below n_id_train belong to the training split.
    train_part = merged[merged["ID"] < n_id_train].drop("ID", axis=1)
    test_part = merged[merged["ID"] >= n_id_train].drop("ID", axis=1)

    y_train = train_part[labelnames].values
    y_test = test_part[labelnames].values
    print_labelinfo(labelnames)
    X_train = train_part.drop(labelnames, axis=1).values
    X_test = test_part.drop(labelnames, axis=1).values
    return X_train, y_train, X_test, y_test


## Confirm the shape of the data ¶

In [9]:

# NOTE(review): df_test and n_id_test are not defined in any visible cell --
# a load_data("adult.test", col_names, n_id_train, skiprows=1) call appears
# to be missing from this export (its printed output shows at the trimming
# line above); confirm it runs before this cell.
X_train, y_train, X_test, y_test = \
convert_data(df_train, df_test, n_id_train, n_id_test, continuous_dicts)
print("X_train:{} y_train:{} X_test:{} y_test:{}".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

labelname:[' <=50K', ' >50K']
label0: <=50K
label1: >50K
X_train:(30162, 104) y_train:(30162, 2) X_test:(15060, 104) y_test:(15060, 2)


## Model definition ¶

In [10]:

# Feed-forward classifier: two ReLU hidden layers (128 and 64 units) and a
# 2-unit output layer, one logit per income class. The input width is
# inferred by renom on the first forward pass.
sequential = rm.Sequential([
rm.Dense(128),
rm.Relu(),
rm.Dense(64),
rm.Relu(),
rm.Dense(2)
])


## Training Loop ¶

We make a random index using the function numpy.random.permutation, and construct batch data. The variable "l" contains the loss of the error function, "l.grad()" differentiates its parameter and updates the parameters using the function "grad.update(optimizer)".

In [11]:

batch_size = 128
epoch = 500
N = len(X_train)
# BUG FIX: `Sgd` was unqualified, but only `renom as rm` is imported --
# qualify it so the name resolves.
optimizer = rm.Sgd(lr=0.001)
learning_curve = []
test_learning_curve = []

for i in range(epoch):
    # Shuffle the sample order each epoch via a random permutation.
    perm = np.random.permutation(N)
    loss = 0
    for j in range(0, N // batch_size):
        train_batch = X_train[perm[j*batch_size : (j+1)*batch_size]]
        response_batch = y_train[perm[j*batch_size : (j+1)*batch_size]]

        # Forward pass inside train() so the graph records gradients.
        with sequential.train():
            l = rm.softmax_cross_entropy(sequential(train_batch), response_batch)
        # BUG FIX: the backward pass and parameter update described in the
        # text above were missing, so the model never learned. Differentiate
        # the loss and apply the SGD update.
        grad = l.grad()
        grad.update(optimizer)
        loss += l.as_ndarray()
    train_loss = loss / (N // batch_size)

    # Track test loss each epoch for the learning-curve plot.
    test_loss = rm.softmax_cross_entropy(sequential(X_test), y_test).as_ndarray()
    test_learning_curve.append(test_loss)
    learning_curve.append(train_loss)
    if i % 50 == 0:
        print("epoch:{:03d}, train_loss:{:.4f}, test_loss:{:.4f}".format(i, float(train_loss), float(test_loss)))

epoch:000, train_loss:0.6276, test_loss:0.5707
epoch:050, train_loss:0.3697, test_loss:0.3690
epoch:100, train_loss:0.3529, test_loss:0.3549
epoch:150, train_loss:0.3448, test_loss:0.3482
epoch:200, train_loss:0.3395, test_loss:0.3443
epoch:250, train_loss:0.3361, test_loss:0.3419
epoch:300, train_loss:0.3329, test_loss:0.3398
epoch:350, train_loss:0.3301, test_loss:0.3375
epoch:400, train_loss:0.3277, test_loss:0.3358
epoch:450, train_loss:0.3248, test_loss:0.3339


## Prediction and Evaluation ¶

In [12]:

# Predicted class = argmax over the two output logits.
predictions = np.argmax(sequential(X_test).as_ndarray(), axis=1)
# One-hot labels -> integer class indices. np.argmax over axis=1 replaces
# the original per-row Python loop (`list(row).index(1.0)`) with a single
# vectorized call; on one-hot rows both find the position of the 1.0.
ans_train = np.argmax(y_train, axis=1)
ans_test = np.argmax(y_test, axis=1)

# Confusion matrix and per-class precision/recall/f1 report.
print(confusion_matrix(ans_test, predictions))
print(classification_report(ans_test, predictions))

# Plot train vs. test loss across epochs.
plt.plot(learning_curve, linewidth=1, label="train")
plt.plot(test_learning_curve, linewidth=1, label="test")
plt.title("learning_curve")
plt.ylabel("error")
plt.xlabel("epoch")
plt.legend()
plt.grid()
plt.show()

[[10461   899]
[ 1465  2235]]
precision    recall  f1-score   support

0       0.88      0.92      0.90     11360
1       0.71      0.60      0.65      3700

avg / total       0.84      0.84      0.84     15060