# Annual Income Classification ¶

Annual income classification using the Adult dataset.

This dataset is used to predict whether a person's annual income is greater than 50K or not. We will construct a neural network model for this task.
The details are described below.

The reference of the dataset is below.

Adult Data Set, Ronny Kohavi and Barry Becker, Data Mining and Visualization, Silicon Graphics.

## Required Libraries ¶

• matplotlib 2.0.2
• numpy 1.12.1
• scikit-learn 0.18.2
In [1]:

from __future__ import division, print_function
import numpy as np
import pandas as pd

import renom as rm
from renom.optimizer import Sgd, Adam
from renom.cuda import set_cuda_active

from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
# Pass True to run on the GPU; pass False to run on the CPU.
set_cuda_active(False)


## Import column infos ¶

Read the column names from the dataset description file, and return them together with a dictionary that records whether each column is numeric (continuous) or not.

In [2]:

def make_col_names(filename="adult.names", header_lines=96):
    """Read the column names and their kinds from the dataset description file.

    The first ``header_lines`` lines of the file are skipped. Every remaining
    line is expected to look like ``name: continuous.`` (numeric column) or
    ``name: value1, value2, ...`` (categorical column).

    Args:
        filename: Path of the description file. Defaults to "adult.names".
        header_lines: Number of leading lines to skip before the column
            descriptions start. Defaults to 96.

    Returns:
        A tuple ``(col_names, continuous_dict)``. ``col_names`` lists the
        column names in file order followed by "label". ``continuous_dict``
        maps each column name to True when the column is continuous and to
        False otherwise; "label" is always False.
    """
    col_names = []
    continuous_dict = {}
    # The context manager guarantees that the file handle is closed.
    with open(filename, "r") as names_file:
        for i, line in enumerate(names_file, 1):
            if i <= header_lines:
                continue
            line = line.rstrip()
            if ":" not in line:
                # Blank or malformed line: there is no "name: spec" pair here.
                continue
            parts = line.split(":")
            name = parts[0]
            col_names.append(name)
            # Drop spaces and the trailing period so that " continuous."
            # compares equal to "continuous".
            spec = parts[1].replace(" ", "").replace(".", "")
            continuous_dict[name] = spec == "continuous"
    col_names.append("label")
    continuous_dict["label"] = False
    return col_names, continuous_dict


## Get the column names and whether each column is continuous ¶

In [3]:

# Column names and the continuous/categorical flag of every column.
col_names, continuous_dicts = make_col_names()

# Starting value of the running record ID handed to the first load_data call.
n_id = 0


## Load the data and make Index ¶

We display some information about the missing values and apply some preprocessing.

In [4]:

def load_data(filename, col_names, n):
    """Load one CSV file, clean it and attach a running record ID.

    The file is read without a header row. Cells equal to " ?" are treated as
    missing and every row that contains a missing value is dropped. The label
    spellings " <=50K." and " >50K." are unified to " <=50K" and " >50K".
    Record counts and per-column missing-value counts are printed.

    Args:
        filename: Path of the CSV file to read.
        col_names: Column names to assign to the loaded frame.
        n: ID given to the first remaining record; later records are
            numbered consecutively.

    Returns:
        A tuple ``(df, next_n)``: the cleaned frame with an extra "ID"
        column, and ``n`` plus the number of remaining records.
    """
    df = pd.read_csv(filename, header=None, index_col=None)
    # Display the number of records before deleting missing values.
    print("the number of {} records:{}\n".format(filename, len(df.index)))
    df.columns = col_names

    # Vectorized replacements instead of per-cell Python lambdas:
    # mark the missing-value marker as NaN and unify the label spellings.
    df = (
        df.replace(" ?", np.nan)
        .replace(" <=50K.", " <=50K")
        .replace(" >50K.", " >50K")
    )

    # Display the number of missing values per column, then drop those rows.
    print("missing value info:\n{}\n".format(df.isnull().sum(axis=0)))
    df = df.dropna(axis=0)

    # The number of records after deleting missing values.
    print("the number of {} records after trimming:{}\n".format(filename, len(df.index)))
    n_rows = len(df.index)
    df["ID"] = np.arange(n, n + n_rows)
    return df, n + n_rows


## Load the data and some preprocessing for pandas data frame ¶

The adult.data is for the training, the adult.test is for the prediction.

Note that the first line of "adult.test" is redundant. Use a text editor to delete the first line of "adult.test" before going any further.

In [5]:

# Training records are numbered starting from n_id.
df_train, n_id_train = load_data(filename="adult.data", col_names=col_names, n=n_id)
# Test records continue the numbering where the training records stopped.
df_test, n_id_test = load_data(filename="adult.test", col_names=col_names, n=n_id_train)

the number of adult.data records:32561

missing value info:
age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
label                0
dtype: int64

the number of adult.data records after trimming:30162

the number of adult.test records:16281

missing value info:
age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
label               0
dtype: int64

the number of adult.test records after trimming:15060



## Get non-continuous columns ¶

In [6]:

def get_not_continuous_columns(continuous_dict):
    """Return the names of the columns that are flagged as not continuous.

    Args:
        continuous_dict: Mapping from column name to a flag that is truthy
            when the column is continuous.

    Returns:
        A list of the column names whose flag is falsy, in the iteration
        order of ``continuous_dict``.
    """
    categorical_names = []
    for column, is_continuous in continuous_dict.items():
        if is_continuous:
            continue
        categorical_names.append(column)
    return categorical_names


## Display label information ¶

In [7]:

def print_labelinfo(labelnames):
    """Print every label name together with its class index.

    One line of the form ``label<index>:<name>`` is printed per label, with
    indices starting at 0.

    Args:
        labelnames: Iterable of label names in class-index order.
    """
    for i, labelname in enumerate(labelnames):
        print("label{}:{}".format(i, labelname))


## Convert the data from pandas data frame to numpy data array ¶

We convert the categorical data to a one-hot vector representation.

In [8]:

def convert_data(df_train, df_test, n_id_train, n_id_test, continuous_dicts):
    """Turn the training and test frames into normalized numpy arrays.

    Both frames are stacked so that they share the same one-hot columns and
    the same scaling. Every non-continuous column is replaced by its one-hot
    (dummy) columns, all values are cast to float, and every column except
    "ID" is min-max scaled to [0, 1] using the minimum and maximum over the
    stacked data. The rows are then split back by their "ID" value.

    Args:
        df_train: Training frame; must contain "ID" and "label" columns.
        df_test: Test frame with the same columns as ``df_train``.
        n_id_train: First ID that belongs to the test frame; rows with a
            smaller ID are training rows.
        n_id_test: Unused; kept so that existing callers keep working.
        continuous_dicts: Mapping from column name to True when the column
            is continuous.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays, where
        the ``y`` arrays hold the one-hot encoded label columns.

    Raises:
        ValueError: If "label" is not one of the non-continuous columns.
    """
    categorical_names = get_not_continuous_columns(continuous_dicts)
    if "label" not in categorical_names:
        raise ValueError('continuous_dicts must flag "label" as not continuous')

    # The two frames carry overlapping row labels; a fresh index keeps the
    # column-wise concat below unambiguous. Rows are split again by "ID".
    df = pd.concat((df_train, df_test), axis=0, ignore_index=True)

    # Get the dummies for the categorical data (computed once per column).
    dummy_frames = []
    for name in categorical_names:
        dummy_df = pd.get_dummies(df[name])
        if name == "label":
            labelnames = list(dummy_df.columns)
            print("labelname:{}".format(labelnames))
        dummy_frames.append(dummy_df)
    df = pd.concat([df.drop(categorical_names, axis=1)] + dummy_frames, axis=1)

    # Convert the data type.
    df = df.astype(float)

    # Min-max scale every column except the record ID.
    for name in df.columns:
        if name == "ID":
            continue
        col_min = df[name].min()
        col_range = df[name].max() - col_min
        if col_range == 0:
            # A constant column would divide by zero; map it to 0 instead.
            df[name] = 0.0
        else:
            df[name] = (df[name] - col_min) / col_range

    is_train = df["ID"] < n_id_train
    df_train = df[is_train].drop("ID", axis=1)
    df_test = df[~is_train].drop("ID", axis=1)

    y_train = df_train[labelnames].values
    y_test = df_test[labelnames].values
    print_labelinfo(labelnames)
    X_train = df_train.drop(labelnames, axis=1).values
    X_test = df_test.drop(labelnames, axis=1).values
    return X_train, y_train, X_test, y_test


## Confirm the shape of the data ¶

In [9]:

# Build the feature/label arrays for the training and the test split.
X_train, y_train, X_test, y_test = convert_data(
    df_train, df_test, n_id_train, n_id_test, continuous_dicts
)
# Report the shape of each array.
print(
    "X_train:{} y_train:{} X_test:{} y_test:{}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

labelname:[' <=50K', ' >50K']
label0: <=50K
label1: >50K
X_train:(30162, 104) y_train:(30162, 2) X_test:(15060, 104) y_test:(15060, 2)


## Model definition ¶

In [10]:

# Network definition: Dense(128) -> ReLU -> Dense(64) -> ReLU -> Dense(2).
# The final Dense(2) matches the two label classes (" <=50K" and " >50K").
# No softmax layer is included here; the training loop feeds these outputs
# to rm.softmax_cross_entropy.
sequential = rm.Sequential([
rm.Dense(128),
rm.Relu(),
rm.Dense(64),
rm.Relu(),
rm.Dense(2)
])


## Training Loop ¶

We create a random index ordering with the function numpy.random.permutation and use it to build the mini-batches. The variable "l" holds the value of the loss function; "l.grad()" computes the gradients with respect to the parameters, and "grad.update(optimizer)" updates the parameters.

In [11]:

# Hyperparameters and bookkeeping for the training run.
batch_size = 128
epoch = 500
N = len(X_train)
optimizer = Sgd(lr=0.001)
learning_curve = []
test_learning_curve = []

# Only full mini-batches are used; the remainder of each shuffled epoch
# (N % batch_size samples) is skipped.
n_batches = N // batch_size

for i in range(epoch):
    # Draw a new random sample order for this epoch.
    perm = np.random.permutation(N)
    loss = 0
    for j in range(n_batches):
        batch_idx = perm[j * batch_size:(j + 1) * batch_size]
        train_batch = X_train[batch_idx]
        response_batch = y_train[batch_idx]

        # NOTE(review): the original indentation was lost; the backward pass
        # is placed after the train() block, as is usual for ReNom - confirm.
        with sequential.train():
            l = rm.softmax_cross_entropy(sequential(train_batch), response_batch)
        grad = l.grad()
        grad.update(optimizer)
        loss += l.as_ndarray()

    # Mean training loss over the mini-batches of this epoch.
    train_loss = loss / n_batches

    # Loss on the complete test set, evaluated outside of train().
    test_loss = rm.softmax_cross_entropy(sequential(X_test), y_test).as_ndarray()
    test_learning_curve.append(test_loss)
    learning_curve.append(train_loss)

    if i % 50 == 0:
        print("epoch:{:03d}, train_loss:{:.4f}, test_loss:{:.4f}".format(i, float(train_loss), float(test_loss)))

epoch:000, train_loss:0.6423, test_loss:0.5861
epoch:050, train_loss:0.3709, test_loss:0.3707
epoch:100, train_loss:0.3548, test_loss:0.3568
epoch:150, train_loss:0.3472, test_loss:0.3508
epoch:200, train_loss:0.3427, test_loss:0.3468
epoch:250, train_loss:0.3395, test_loss:0.3441
epoch:300, train_loss:0.3365, test_loss:0.3420
epoch:350, train_loss:0.3337, test_loss:0.3401
epoch:400, train_loss:0.3315, test_loss:0.3385
epoch:450, train_loss:0.3290, test_loss:0.3368


## Prediction and Evaluation ¶

In [12]:

# Predicted class = index of the largest network output for each test row.
predictions = np.argmax(sequential(X_test).as_ndarray(), axis=1)
# Decode the one-hot label rows to class indices the same vectorized way
# (the index of the 1.0 entry is the index of the row maximum).
ans_train = np.argmax(y_train, axis=1)
ans_test = np.argmax(y_test, axis=1)

print(confusion_matrix(ans_test, predictions))
print(classification_report(ans_test, predictions))

# Plot the per-epoch training and test loss recorded by the training loop.
plt.plot(learning_curve, linewidth=1, label="train")
plt.plot(test_learning_curve, linewidth=1, label="test")
plt.title("learning_curve")
plt.ylabel("error")
plt.xlabel("epoch")
plt.legend()
plt.grid()
plt.show()

[[10512   848]
[ 1539  2161]]
precision    recall  f1-score   support

0       0.87      0.93      0.90     11360
1       0.72      0.58      0.64      3700

avg / total       0.83      0.84      0.84     15060



The confusion matrix and the classification report provide a lot of information about the evaluation.

Over the 500 epochs shown above, both the training error and the test error keep declining, but the test error declines more slowly, so the gap between the two gradually widens.
So, we have to adjust the learning rate and check how the behavior changes when the learning rate is changed.
We examine this in another tutorial.