# 年収分類 ¶

adult datasetを用いた年収分類

adultデータセットを用いて，年収分類モデルを学習する方法を紹介します．このデータセットは，ある人の年収が50Kを超えるかどうかの予測に用いられます．
データの詳細を以下に示します．

Datasetのリファレンスは以下になります．

Adult Data Set, Ronny Kohavi and Barry Becker, Data Mining and Visualization, Silicon Graphics.

## Required Libraries ¶

• matplotlib 2.0.2
• numpy 1.12.1
• scikit-learn 0.18.2
In [1]:

from __future__ import division, print_function
import numpy as np
import pandas as pd

import renom as rm
from renom.optimizer import Sgd, Adam
from renom.cuda import set_cuda_active

from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
# Set this to True to run on a GPU; otherwise leave it as False.
set_cuda_active(False)


## 列の情報を読み込みます． ¶

In [2]:

def make_col_names(filename="adult.names", header_lines=96):
    """Read column names and their continuous/categorical kind from a names file.

    The first ``header_lines`` lines of the file are skipped. Every remaining
    line that contains a colon is read as ``<name>: <description>``; a column
    is treated as continuous when its description, with spaces and periods
    removed, is exactly ``continuous``. A final ``label`` column, marked as
    not continuous, is always appended.

    Args:
        filename: Path of the names file to parse.
        header_lines: Number of leading lines to skip before the column
            descriptions start.

    Returns:
        A tuple ``(col_names, continuous_dict)``: the list of column names
        (ending with ``"label"``) and a dict mapping each column name to
        ``True`` if it is continuous, ``False`` otherwise.
    """
    col_names = []
    continuous_dict = {}
    # The context manager guarantees the file is closed, even on error.
    with open(filename, "r") as names_file:
        for line_no, line in enumerate(names_file, 1):
            if line_no <= header_lines:
                continue
            line = line.rstrip()
            # Blank or colon-less lines carry no column description.
            if ":" not in line:
                continue
            name = line.split(":")[0]
            col_names.append(name)
            description = line.replace(" ", "").replace(".", "").split(":")[1]
            continuous_dict[name] = description == "continuous"
    col_names.append("label")
    continuous_dict["label"] = False
    return col_names, continuous_dict


## 列の名前を取得し，数値データかどうかも取得します． ¶

In [3]:

# Column names and the {column name: is_continuous} lookup.
col_names, continuous_dicts = make_col_names()
# Starting value for the running record ID passed to load_data.
n_id = 0


## データを読み込んでインデックスを作成します． ¶

In [4]:

def load_data(filename, col_names, n):
    """Load a header-less CSV file, drop incomplete rows and number the rest.

    The literal value ``" ?"`` is treated as missing, and the label spellings
    ``" <=50K."`` / ``" >50K."`` are unified to ``" <=50K"`` / ``" >50K"``.
    Rows containing any missing value are removed, and the surviving rows get
    an ``ID`` column holding consecutive integers starting at ``n``.

    Record counts before and after trimming, and the per-column count of
    missing values, are printed.

    Args:
        filename: Path of the CSV file to read (no header row).
        col_names: Column names to assign to the loaded frame.
        n: First ID to assign.

    Returns:
        A tuple ``(df, next_n)``: the cleaned DataFrame and the first ID
        that is still unused (``n`` plus the number of kept rows).
    """
    df = pd.read_csv(filename, header=None, index_col=None)
    # Report the number of records before removing missing values.
    print("the number of {} records:{}\n".format(filename, len(df.index)))
    df.columns = col_names

    # One vectorised pass: mark missing values as NaN and unify the two
    # written forms of each label (with and without the trailing period).
    df = df.replace({
        " ?": np.nan,
        " <=50K.": " <=50K",
        " >50K.": " >50K",
    })

    # Report how many values are missing in each column, then drop those rows.
    print("missing value info:\n{}\n".format(df.isnull().sum(axis=0)))
    df = df.dropna(axis=0)

    # Report the number of records left after removing missing values.
    n_records = len(df.index)
    print("the number of {} records after trimming:{}\n".format(filename, n_records))
    df["ID"] = np.arange(n, n + n_records)
    return df, n + n_records


## データを読み込んでpandasのデータフレームに対していくつかの前処理を行います． ¶

adult.dataは訓練用のデータ，adult.testは予測用のデータになります．

'adult.test'については最初の一行が不要です.テキストエディタ等で,不要な行を削除してください.

In [5]:

# Training records are numbered from n_id; the test records continue from
# the first ID left unused by the training set.
df_train, n_id_train = load_data(filename="adult.data", col_names=col_names, n=n_id)
df_test, n_id_test = load_data(filename="adult.test", col_names=col_names, n=n_id_train)

the number of adult.data records:32561

missing value info:
age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
label                0
dtype: int64

the number of adult.data records after trimming:30162

the number of adult.test records:16281

missing value info:
age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
label               0
dtype: int64

the number of adult.test records after trimming:15060



## 数値データでないデータを取得 ¶

In [6]:

def get_not_continuous_columns(continuous_dict):
    """Return the names of the columns that are not flagged as continuous.

    Args:
        continuous_dict: Mapping from column name to a truthy value when the
            column is continuous.

    Returns:
        A list of the keys whose value is falsy, in the mapping's iteration
        order.
    """
    return [column for column in continuous_dict if not continuous_dict[column]]


## ラベル情報を表示 ¶

In [7]:

def print_labelinfo(labelnames):
    """Print every label name together with its position.

    One line is printed per label, formatted as ``label<index>:<name>``.

    Args:
        labelnames: Iterable of label names, in class-index order.
    """
    for index, labelname in enumerate(labelnames):
        print("label{}:{}".format(index, labelname))


## pandasのデータフレームからnumpyのデータ配列に変換します． ¶

カテゴリデータをワンホットベクトル表現に変換します． |image0|

In [8]:

def convert_data(df_train, df_test, n_id_train, n_id_test, continuous_dicts):
    """Turn the train/test DataFrames into numeric feature and target arrays.

    The two frames are stacked, every non-continuous column is replaced by
    its one-hot (dummy) columns, all values are cast to float and every
    column except ``ID`` is min-max scaled. The stacked frame is then split
    back using the ``ID`` column: rows with ``ID < n_id_train`` form the
    training set, the remaining rows the test set.

    NOTE: the min/max used for scaling are computed over the stacked
    train + test frame, not over the training rows alone.

    Args:
        df_train: Training DataFrame containing an ``ID`` column.
        df_test: Test DataFrame containing an ``ID`` column.
        n_id_train: First ID that belongs to the test set.
        n_id_test: Unused; kept so existing callers keep working.
        continuous_dicts: Mapping from column name to whether the column is
            continuous. ``"label"`` must be present and not continuous.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays, where
        the ``y`` arrays hold the one-hot label columns and the ``X`` arrays
        hold all remaining columns.

    Raises:
        ValueError: If ``"label"`` is not among the non-continuous columns.
    """
    categorical_names = get_not_continuous_columns(continuous_dicts)
    df = pd.concat((df_train, df_test), axis=0)

    # Replace each categorical column by its one-hot columns.
    labelnames = None
    for name in categorical_names:
        dummy_df = pd.get_dummies(df[name])
        if name == "label":
            # Remember the label's dummy columns to split off the targets.
            labelnames = list(dummy_df.columns)
            print("labelname:{}".format(labelnames))
        df = pd.concat((df, dummy_df), axis=1)
        df = df.drop(name, axis=1)
    if labelnames is None:
        raise ValueError('continuous_dicts must mark "label" as not continuous')

    # Convert every column to float.
    df = df.astype(float)

    # Min-max scale every column except the ID used for splitting.
    for name in df.columns:
        if name == "ID":
            continue
        col_min = df[name].min()
        col_range = df[name].max() - col_min
        if col_range == 0:
            # A constant column would give 0/0 = NaN; map it to 0 instead.
            df[name] = 0.0
        else:
            df[name] = (df[name] - col_min) / col_range

    df_train = df[df["ID"] < n_id_train].drop("ID", axis=1)
    df_test = df[df["ID"] >= n_id_train].drop("ID", axis=1)

    y_train = df_train[labelnames].values
    y_test = df_test[labelnames].values
    print_labelinfo(labelnames)
    X_train = df_train.drop(labelnames, axis=1).values
    X_test = df_test.drop(labelnames, axis=1).values
    return X_train, y_train, X_test, y_test


## データのShapeを確認 ¶

In [9]:

# Build the feature / target arrays and report their shapes.
X_train, y_train, X_test, y_test = convert_data(
    df_train, df_test, n_id_train, n_id_test, continuous_dicts
)
shape_report = "X_train:{} y_train:{} X_test:{} y_test:{}"
print(shape_report.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

labelname:[' <=50K', ' >50K']
label0: <=50K
label1: >50K
X_train:(30162, 104) y_train:(30162, 2) X_test:(15060, 104) y_test:(15060, 2)


## モデルの定義 ¶

In [10]:

# Fully connected network: two hidden layers with ReLU activations followed
# by a 2-unit output layer (y_train has two label columns).
sequential = rm.Sequential([
    rm.Dense(128), rm.Relu(),
    rm.Dense(64), rm.Relu(),
    rm.Dense(2),
])


## 学習ループ ¶

numpy.random.permutationを用いてランダムに並べ替えたインデックスを作成し，バッチデータを作成します．lは誤差関数の値（ロス）を保持しており，l.grad()はロスに対する各パラメータの勾配を計算し，grad.update(optimizer)によってパラメータを更新します．

In [11]:

# Training configuration.
batch_size = 128
epoch = 500
N = len(X_train)
optimizer = Sgd(lr=0.001)

# Per-epoch loss history for the training and test sets.
learning_curve = []
test_learning_curve = []

# Only full batches are used; a trailing partial batch is skipped.
n_batches = N // batch_size

for epoch_idx in range(epoch):
    # Visit the training rows in a fresh random order every epoch.
    perm = np.random.permutation(N)
    loss = 0
    for batch_idx in range(n_batches):
        rows = perm[batch_idx * batch_size:(batch_idx + 1) * batch_size]

        # Forward pass in training mode.
        with sequential.train():
            l = rm.softmax_cross_entropy(sequential(X_train[rows]), y_train[rows])
        # Backward pass and parameter update.
        grad = l.grad()
        grad.update(optimizer)
        loss += l.as_ndarray()
    # Mean training loss over the batches of this epoch.
    train_loss = loss / n_batches

    # Loss on the whole test set with the current parameters.
    test_loss = rm.softmax_cross_entropy(sequential(X_test), y_test).as_ndarray()
    learning_curve.append(train_loss)
    test_learning_curve.append(test_loss)
    if epoch_idx % 50 == 0:
        print("epoch:{:03d}, train_loss:{:.4f}, test_loss:{:.4f}".format(epoch_idx, float(train_loss), float(test_loss)))

epoch:000, train_loss:0.6423, test_loss:0.5861
epoch:050, train_loss:0.3709, test_loss:0.3707
epoch:100, train_loss:0.3548, test_loss:0.3568
epoch:150, train_loss:0.3472, test_loss:0.3508
epoch:200, train_loss:0.3427, test_loss:0.3468
epoch:250, train_loss:0.3395, test_loss:0.3441
epoch:300, train_loss:0.3365, test_loss:0.3420
epoch:350, train_loss:0.3337, test_loss:0.3401
epoch:400, train_loss:0.3315, test_loss:0.3385
epoch:450, train_loss:0.3290, test_loss:0.3368


## 予測と評価 ¶

In [12]:

# Predicted class: index of the largest network output for each test row.
predictions = np.argmax(sequential(X_test).as_ndarray(), axis=1)
# The targets are one-hot rows, so the true class is the position of the 1;
# argmax finds it in one vectorised call instead of a per-row Python loop.
ans_train = np.argmax(y_train, axis=1)
ans_test = np.argmax(y_test, axis=1)

print(confusion_matrix(ans_test, predictions))
print(classification_report(ans_test, predictions))

# Plot the per-epoch training and test loss recorded during training.
plt.plot(learning_curve, linewidth=1, label="train")
plt.plot(test_learning_curve, linewidth=1, label="test")
plt.title("learning_curve")
plt.ylabel("error")
plt.xlabel("epoch")
plt.legend()
plt.grid()
plt.show()

[[10512   848]
[ 1539  2161]]
precision    recall  f1-score   support

0       0.87      0.93      0.90     11360
1       0.72      0.58      0.64      3700

avg / total       0.83      0.84      0.84     15060



Confusion MatrixとClassification Reportは評価に関する多くの情報を持っています．

1000epochまでは訓練誤差とテスト誤差がともに徐々に下がっていきますが，テスト誤差はその後少しずつ上昇していきます．
そのため，学習率を調整したり，学習率によって学習曲線がどのように変わるのかを調べたりする必要があります．
これについては別のチュートリアルで検証したいと思います．