# 適合率と再現率 ¶

ニューラルネットワークを用いた分類モデルを構築します．

Confusion matrixとclassification reportは評価に関する多くの情報を内包しています．

• 適合率は予測の正解がどれくらいであったかという指標です．

• 一方で再現率は提示した正解がもともとのラベルの正解をどれだけ拾えていたかの指標です．

• f1-scoreはその2つのバランスを取った指標になっています．

データのリファレンスは以下になります．

Adult Data Set, Ronny Kohavi and Barry Becker, Data Mining and Visualization, Silicon Graphics.

## Required Libraries ¶

• matplotlib 2.0.2
• numpy 1.12.1
• scikit-learn 0.18.2
• pandas 0.20.3
In [1]:

from __future__ import division, print_function
import numpy as np
import pandas as pd

import renom as rm
from renom.cuda import set_cuda_active

from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
# If you would like to use GPU computing, set this to True. Otherwise, False.
set_cuda_active(False)


## 列の情報を読み込みます． ¶

In [2]:

def make_col_names(filename="adult.names"):
    """Parse the UCI "adult.names" description file.

    Parameters
    ----------
    filename : str, optional
        Path to the names file (defaults to "adult.names" in the
        working directory, matching the original notebook).

    Returns
    -------
    col_names : list of str
        Column names in file order, plus a trailing "label" column.
    continuous_dict : dict
        Maps each column name to True when the column is continuous
        (numeric) and False when it is categorical.
    """
    # NOTE(review): the exported cell was truncated — `i` and `line`
    # were referenced without being defined. The loop over the names
    # file is reconstructed here; the `i > 96` preamble skip is kept
    # from the original code.
    col_names = []
    continuous_dict = {}
    with open(filename) as f:
        for i, line in enumerate(f):
            # Attribute definitions ("name: type.") start after the
            # prose preamble of the names file.
            if i > 96:
                line = line.rstrip()
                name = line.split(":")[0]
                col_names.append(name)
                line = line.replace(" ", "").replace(".", "")
                continuous = line.split(":")[1] == "continuous"
                continuous_dict[name] = continuous
    # The data files carry one extra column: the income label.
    col_names.append("label")
    continuous_dict["label"] = False
    return col_names, continuous_dict


## 列の名前を取得し，数値データかどうかも取得します． ¶

In [3]:

# Running row-ID counter; load_data() assigns consecutive IDs from here.
n_id = 0
# Column names and a name -> is-continuous flag map parsed from the names file.
col_names, continuous_dicts = make_col_names()


## データを読み込んでインデックスを作成します． ¶

In [4]:

def load_data(filename, col_names, n, skiprows=None):
    """Load one adult data file into a cleaned pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path to the CSV data file (no header row).
    col_names : list of str
        Column names to assign, in file order.
    n : int
        First row ID to assign; rows get consecutive IDs from here.
    skiprows : int or list-like, optional
        Forwarded to ``pd.read_csv`` (e.g. "adult.test" starts with
        one junk line).

    Returns
    -------
    (df, n) : tuple
        The cleaned DataFrame with an extra "ID" column, and the next
        unused ID value.
    """
    # NOTE(review): the exported cell was truncated — both branches of
    # this if/else were empty. Reconstructed as the read_csv calls the
    # rest of the function requires (header=None: columns set below).
    if skiprows:
        df = pd.read_csv(filename, header=None, skiprows=skiprows)
    else:
        df = pd.read_csv(filename, header=None)

    # Display the number of records before deleting missing values.
    print("the number of {} records:{}\n".format(filename, len(df.index)))
    df.columns = col_names

    # Replace the missing value's character with np.nan.
    # (DataFrame.replace does the same exact-match substitution the
    # original element-wise applymap lambdas did, without the
    # deprecated applymap API.)
    df = df.replace(" ?", np.nan)

    # Unify the different written forms (the test file labels carry a
    # trailing period).
    df = df.replace(" <=50K.", " <=50K")
    df = df.replace(" >50K.", " >50K")

    # Display information about missing values per column.
    print("missing value info:\n{}\n".format(df.isnull().sum(axis=0)))
    df = df.dropna(axis=0)

    # The number of records after deleting missing values.
    print("the number of {} records after trimming:{}\n".format(filename, len(df.index)))
    ids = list(np.arange(n, n + len(df.index)))
    df["ID"] = np.array(ids)
    n = n + len(df.index)
    return df, n


## データを読み込んでpandasのデータフレームに対していくつかの前処理を行います． ¶

In [5]:

df_train,n_id_train = load_data("adult.data", col_names, n_id)

the number of adult.data records:32561

missing value info:
age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
label                0
dtype: int64

the number of adult.data records after trimming:30162

missing value info:
age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
label               0
dtype: int64

the number of adult.test records after trimming:15060



## 数値データでないデータを取得 ¶

In [6]:

def get_not_continuous_columns(continuous_dict):
    """Return the names of the categorical (non-continuous) columns.

    ``continuous_dict`` maps column name -> bool (True = continuous);
    names are returned in the dict's insertion order.
    """
    names = []
    for col, is_continuous in continuous_dict.items():
        if not is_continuous:
            names.append(col)
    return names


## ラベル情報を表示 ¶

In [7]:

def print_labelinfo(labelnames):
    """Print each label index together with its name, one per line."""
    for idx, name in enumerate(labelnames):
        print("label{}:{}".format(idx, name))


## pandasのデータフレームからnumpyのデータ配列に変換します． ¶

カテゴリデータをワンホットベクトル表現に変換します．

In [8]:

def convert_data(df_train, df_test, n_id_train, n_id_test, continuous_dicts):
    """Turn the train/test DataFrames into numpy arrays for the network.

    Categorical columns are one-hot encoded, every column is min-max
    scaled into [0, 1], and the concatenated frame is split back into
    train/test via the "ID" column (IDs below ``n_id_train`` are
    training rows). Returns ``X_train, y_train, X_test, y_test``.
    """
    categorical_names = get_not_continuous_columns(continuous_dicts)
    df = pd.concat((df_train, df_test), axis=0)

    # One-hot encode each categorical column, remembering the dummy
    # column names that the label column produced.
    labelnames = None
    for name in categorical_names:
        dummies = pd.get_dummies(df[name])
        if name == "label":
            labelnames = list(dummies.columns)
            print("labelname:{}".format(labelnames))
        df = pd.concat((df, dummies), axis=1).drop(name, axis=1)

    # Everything becomes float for the network input.
    df = df.astype(float)

    # Min-max scale every column except ID, which must stay intact so
    # the train/test split below still works.
    for name in df.columns:
        if name != "ID":
            col = df[name]
            df[name] = (col - col.min()) / (col.max() - col.min())

    df_train = df[df["ID"] < n_id_train].drop("ID", axis=1)
    df_test = df[df["ID"] >= n_id_train].drop("ID", axis=1)

    # Targets are the one-hot label columns; features are the rest.
    y_train = df_train[labelnames].values
    y_test = df_test[labelnames].values
    print_labelinfo(labelnames)
    X_train = df_train.drop(labelnames, axis=1).values
    X_test = df_test.drop(labelnames, axis=1).values
    return X_train, y_train, X_test, y_test


## データのShapeを確認 ¶

In [9]:

# Convert both frames to numpy arrays and report the resulting shapes.
# NOTE(review): n_id_test must come from a load_data("adult.test", ...)
# call — the cell that produced it appears to be missing from this export.
X_train, y_train, X_test, y_test = \
convert_data(df_train, df_test, n_id_train, n_id_test, continuous_dicts)
print("X_train:{} y_train:{} X_test:{} y_test:{}".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

labelname:[' <=50K', ' >50K']
label0: <=50K
label1: >50K
X_train:(30162, 104) y_train:(30162, 2) X_test:(15060, 104) y_test:(15060, 2)


## モデルの定義 ¶

In [10]:

# A simple fully connected classifier: two ReLU hidden layers
# (128 and 64 units) and a 2-unit output layer, one unit per class.
sequential = rm.Sequential([
    rm.Dense(128), rm.Relu(),
    rm.Dense(64), rm.Relu(),
    rm.Dense(2),
])


## 学習ループ ¶

In [11]:

# Training hyperparameters.
batch_size = 128
epoch = 500
N = len(X_train)
# BUG FIX: `Sgd` was not in scope (the file only does `import renom as rm`);
# reference the optimizer through the renom namespace.
optimizer = rm.Sgd(lr=0.001)
learning_curve = []
test_learning_curve = []

for i in range(epoch):
    # Reshuffle the training set every epoch.
    perm = np.random.permutation(N)
    loss = 0
    for j in range(0, N // batch_size):
        train_batch = X_train[perm[j*batch_size : (j+1)*batch_size]]
        response_batch = y_train[perm[j*batch_size : (j+1)*batch_size]]

        # Forward pass inside train() so the graph is recorded, then
        # backprop and apply the SGD update.
        # NOTE(review): the exported cell was missing the grad/update
        # step; without it the weights never change, contradicting the
        # decreasing losses in the captured output.
        with sequential.train():
            l = rm.softmax_cross_entropy(sequential(train_batch), response_batch)
        grad = l.grad()
        grad.update(optimizer)
        loss += l.as_ndarray()
    train_loss = loss / (N // batch_size)

    # Track train and test loss for the learning-curve plot.
    test_loss = rm.softmax_cross_entropy(sequential(X_test), y_test).as_ndarray()
    test_learning_curve.append(test_loss)
    learning_curve.append(train_loss)
    if i % 50 == 0:
        print("epoch:{:03d}, train_loss:{:.4f}, test_loss:{:.4f}".format(i, float(train_loss), float(test_loss)))

epoch:000, train_loss:0.6276, test_loss:0.5707
epoch:050, train_loss:0.3697, test_loss:0.3690
epoch:100, train_loss:0.3529, test_loss:0.3549
epoch:150, train_loss:0.3448, test_loss:0.3482
epoch:200, train_loss:0.3395, test_loss:0.3443
epoch:250, train_loss:0.3361, test_loss:0.3419
epoch:300, train_loss:0.3329, test_loss:0.3398
epoch:350, train_loss:0.3301, test_loss:0.3375
epoch:400, train_loss:0.3277, test_loss:0.3358
epoch:450, train_loss:0.3248, test_loss:0.3339


## 予測と評価 ¶

In [12]:

# Predicted class per test record: index of the larger output unit.
predictions = sequential(X_test).as_ndarray().argmax(axis=1)
# Recover integer class labels from the one-hot target rows.
ans_train = np.array([list(r).index(1.0) for r in y_train])
ans_test = np.array([list(r).index(1.0) for r in y_test])

# Confusion matrix plus per-class precision / recall / f1-score.
print(confusion_matrix(ans_test, predictions))
print(classification_report(ans_test, predictions))

# Learning curves for the train and test losses.
plt.plot(learning_curve, linewidth=1, label="train")
plt.plot(test_learning_curve, linewidth=1, label="test")
plt.title("learning_curve")
plt.ylabel("error")
plt.xlabel("epoch")
plt.legend()
plt.grid()
plt.show()

[[10461   899]
[ 1465  2235]]
precision    recall  f1-score   support

0       0.88      0.92      0.90     11360
1       0.71      0.60      0.65      3700

avg / total       0.84      0.84      0.84     15060