Dacon Computer Vision (Classification)

Competition Overview

png

A competition to accurately classify the digit hidden inside each letter image using a deep learning model.

Competition URL: https://www.dacon.io/competitions/official/235626/overview/


import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

Loading the Data

train = pd.read_csv('train.csv', index_col = 0)  # use the id column as the index
test = pd.read_csv('test.csv', index_col = 0)
sub = pd.read_csv('submission.csv', index_col = 0)
train.head()
    digit letter  0  1  2  3  ...  780  781  782  783
id
1       5      L  1  1  1  4  ...    4    4    3    4
2       0      B  0  4  0  0  ...    4    2    1    2
3       4      L  1  1  2  2  ...    3    0    2    2
4       9      D  1  2  0  2  ...    0    0    1    1
5       6      A  3  0  2  4  ...    4    3    1    2

5 rows × 786 columns


test.head()
     letter  0  1  2  3  ...  780  781  782  783
id
2049      L  0  4  0  2  ...    3    4    1    4
2050      C  4  1  4  0  ...    2    2    1    2
2051      S  0  4  0  1  ...    3    0    1    4
2052      K  2  1  3  3  ...    0    4    4    4
2053      W  1  0  1  1  ...    1    2    3    4

5 rows × 785 columns

  • The task is to predict the digit column.

Exploring the Data

train.shape, test.shape, sub.shape 
((2048, 786), (20480, 785), (20480, 1))
  • Checking each dataset's shape: 2,048 training rows and 20,480 test rows.

print('digit :', train['digit'].unique(),'\n','num_digit :',train['digit'].nunique()) 
print('\n','letter : ', train['letter'].unique(),'\n','num_letter :', train['letter'].nunique()) 
digit : [5 0 4 9 6 8 1 3 2 7] 
 num_digit : 10

 letter :  ['L' 'B' 'D' 'A' 'C' 'Q' 'M' 'F' 'J' 'H' 'N' 'X' 'I' 'R' 'V' 'Y' 'T' 'S'
 'U' 'P' 'K' 'O' 'Z' 'G' 'E' 'W'] 
 num_letter : 26
  • 10 digit classes (0-9)
  • 26 letters of the alphabet (a per-letter breakdown is sketched below)
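
It can also be useful to see how the digits are distributed across the letters. A quick sketch (my addition) using pd.crosstab:

pair_counts = pd.crosstab(train['letter'], train['digit'])  # count of each (letter, digit) pair
print(pair_counts.head())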

train.dtypes 
digit      int64
letter    object
0          int64
1          int64
2          int64
           ...  
779        int64
780        int64
781        int64
782        int64
783        int64
Length: 786, dtype: object
  • Since the model must predict the digit, the digit column should be treated as a categorical target rather than a number; a one-hot encoding sketch follows below.
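
A minimal sketch of the one-hot encoding used later in this notebook: pd.get_dummies turns the single digit column into ten indicator columns, one per class.

y_onehot = pd.get_dummies(train['digit'])  # shape (2048, 10): one indicator column per digit
print(y_onehot.shape)
print(y_onehot.head(3))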

train['digit'].value_counts() 
2    233
5    225
6    212
4    207
3    205
1    202
9    197
7    194
0    191
8    182
Name: digit, dtype: int64
  • Checking the distribution of the target variable.
  • The classes are reasonably well balanced, ranging from 182 to 233 samples each; the bar-plot sketch below makes this easy to see.
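
A one-line bar plot of the class counts (sketch):

train['digit'].value_counts().sort_index().plot(kind='bar', figsize=(8, 4), title='digit distribution')
plt.show()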

Data Visualization

fig, axes = plt.subplots(2, 5)
fig.set_size_inches(20, 8)

for index in range(10):
    # positional row index*3 corresponds to id label index*3 + 1
    img = np.array(train.iloc[index*3, 2:]).reshape(28, 28).astype(float)  # np.float is removed in recent NumPy
    axes[index//5, index%5].imshow(img)
    axes[index//5, index%5].set_title('Letter : {}\nDigit : {}'.format(train['letter'][index*3+1], train['digit'][index*3+1]))
    axes[index//5, index%5].axis('off')

png

  • A digit is hidden inside each letter.

A Closer Look at the Images

train_img = train.iloc[:, 2:].values.reshape(-1, 28, 28, 1).astype(float)  # extract only the pixel columns

data = np.where(train_img >= 140, train_img, 0)  # zero out every pixel below 140
plt.imshow(data[3].reshape(28, 28), cmap='gray')
plt.title('Letter : {} , Digit : {}'.format(train['letter'][4], train['digit'][4]))
plt.axis('off')
(-0.5, 27.5, 27.5, -0.5)

png

  • The digit 9 hidden inside the letter D.
  • After thresholding, it seems only the part of the digit that overlaps the letter remains visible.

Applying Conv2D to the Images

Original image
plt.imshow(np.array(train_img[3].reshape(28,28)))
plt.title('letter: {} | digit: {}'.format( train['letter'][4],train['digit'][4]))
plt.axis('off')
(-0.5, 27.5, 27.5, -0.5)

png


After applying Conv2D
conv2d = Conv2D(64, (3,3), input_shape = (28,28,1))  # untrained layer: filters are randomly initialized
conv2d_activation = Conv2D(64, (3,3), activation = 'relu', input_shape = (28,28,1))

fig, axes = plt.subplots(8, 8)
fig.set_size_inches(16, 16)
for i in range(64):
    axes[i//8, i%8].imshow(conv2d(train_img)[3,:,:,i], cmap='gray')
    axes[i//8, i%8].axis('off')

png


After applying the activation
fig, axes = plt.subplots(8, 8)
fig.set_size_inches(16, 16)
for i in range(64):
    axes[i//8, i%8].imshow(conv2d_activation(train_img)[3,:,:,i], cmap='gray')
    axes[i//8, i%8].axis('off')

png


After applying MaxPooling2D
fig, axes = plt.subplots(8, 8)
fig.set_size_inches(16, 16)
for i in range(64):
    axes[i//8, i%8].imshow(MaxPooling2D(2, 2)(conv2d(train_img))[3, :, :, i], cmap='gray')
    axes[i//8, i%8].axis('off')

png

  • The letter pixels are not removed, so the model trains on them as-is.
  • I therefore experimented with removing the letter-only pixels before training.

Showing Only the Pixels Where the Letter and Digit Overlap

fig, axes = plt.subplots(2, 10)
fig.set_size_inches(20, 5)
print('Letter : {}\nDigit : {}'.format(train['letter'][2], train['digit'][2]))
for i in range(20):
    data = np.where(train_img >= i*5 + 100, train_img, 0)  # keep only pixels at or above the threshold
    axes[i//10, i%10].imshow(data[1].reshape(28, 28))
    axes[i//10, i%10].set_title('pixel size : {}'.format(i*5 + 100), fontsize=12)
    axes[i//10, i%10].axis('off')
Letter : B
Digit : 0

png

  • A pixel threshold of about 140 isolates the overlap best.
  • However, too much information is lost this way, so it did not help training; the sketch below gives a rough estimate of the loss.
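
A rough way to quantify the information loss is to compare how many non-zero pixels survive the threshold (my sketch; train_img is the array built above):

orig_ratio = (train_img > 0).mean()                                 # fraction of non-zero pixels originally
kept_ratio = (np.where(train_img >= 140, train_img, 0) > 0).mean()  # fraction surviving the threshold
print('non-zero pixel ratio: {:.3f} -> {:.3f}'.format(orig_ratio, kept_ratio))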

fig, axes = plt.subplots(2, 2)
axes[0,0].imshow(np.array(train_img[3].reshape(28,28))) # top: originals, bottom: thresholded (note: data still holds the last loop threshold, 195)
axes[1,0].imshow(data[3].reshape(28,28))
axes[0,0].set_title(train['letter'][4])
axes[1,0].set_title(train['digit'][4])
axes[0,0].axis('off')
axes[1,0].axis('off')

axes[0,1].imshow(np.array(train_img[4].reshape(28,28)))
axes[1,1].imshow(data[4].reshape(28,28))
axes[0,1].set_title(train['letter'][5])
axes[1,1].set_title(train['digit'][5])
axes[0,1].axis('off')
axes[1,1].axis('off')
(-0.5, 27.5, 27.5, -0.5)

png


Applying Conv2D when only pixels >= 140 are kept
plt.imshow(np.array(data[4].reshape(28,28)))
plt.title('letter: {} | digit: {}'.format( train['letter'][5],train['digit'][5]))
plt.axis('off')
(-0.5, 27.5, 27.5, -0.5)

png


fig, axes = plt.subplots(8, 8)
fig.set_size_inches(16, 16)
for i in range(64):
    axes[i//8, i%8].imshow(conv2d(data)[4,:,:,i], cmap='gray')
    axes[i//8, i%8].axis('off')

png


Data Augmentation

  • The training set has only 2,048 images, which is small relative to the 10 classes.
  • Data augmentation should therefore help; a preview of the augmented images follows the preprocessing below.
train = pd.read_csv('train.csv', index_col = 0)  # use the id column as the index
test = pd.read_csv('test.csv', index_col = 0)
sub = pd.read_csv('submission.csv', index_col = 0)

X = train.drop(['digit', 'letter'], axis = 1)  # image pixels
y = train['digit']                             # label

# normalization
X = X / 255.0

X = X.values.reshape(-1, 28, 28, 1)
y = pd.get_dummies(y.values)

train_img = train.iloc[:, 2:].values.reshape(-1, 28, 28, 1).astype(float)  # pixel data only
data = np.where(train_img >= 140, train_img, 0)
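
Before training, it is worth eyeballing what ImageDataGenerator actually produces. A sketch (my addition) using the same settings as the model below; flips are left off because mirroring a digit can change its identity:

preview_gen = ImageDataGenerator(rotation_range=10, zoom_range=0.1,
                                 width_shift_range=0.1, height_shift_range=0.1)
fig, axes = plt.subplots(1, 5, figsize=(15, 3))
for i, batch in enumerate(preview_gen.flow(X[:1], batch_size=1)):
    if i >= 5:
        break
    axes[i].imshow(batch[0].reshape(28, 28))  # a fresh random transform of the same image each iteration
    axes[i].axis('off')
plt.show()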

Training a Hand-Built Model After Data Augmentation with ImageDataGenerator

def cnn_model():
  # read the files
  train = pd.read_csv('train.csv', index_col = 0)  # use the id column as the index
  test = pd.read_csv('test.csv', index_col = 0)
  sub = pd.read_csv('submission.csv', index_col = 0)

  X = train.drop(['digit','letter'], axis = 1)  # image pixels
  y = train['digit']                            # label

  # normalization
  X = X / 255.0

  X = (X.values).reshape(-1,28,28,1)
  y = pd.get_dummies(y.values)

  # split into train and validation sets
  X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size = 0.1, random_state=2)



  # build the model
  model = Sequential([
                        Conv2D(512, (3,3), activation = 'relu', input_shape = (28,28,1)), 
                        #MaxPooling2D(),

                        Conv2D(256, (3,3), activation = 'relu'), 
                        #MaxPooling2D(),
                      
                        Conv2D(128, (3,3), activation = 'relu'), 
  
                        Conv2D(128, (3,3), activation = 'relu'), 
                        MaxPooling2D(),
                        
                        Conv2D(64, (3,3), activation = 'relu'), 
                        Conv2D(64, (3,3), activation = 'relu'), 

                        Flatten(),
                        Dropout(0.5),
                        Dense(512, activation='relu'),
                        Dense(10, activation = 'softmax')                    
  ])

  model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['acc'])



  # Model Checkpoint
  check_path = 'check.ckpt'
  check_point = ModelCheckpoint(
      filepath = check_path,
      monitor = 'val_acc',
      verbose = 1,
      save_best_only = True,
      save_weights_only = True)
  
  # Data Augmentation
  datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0.1, # Randomly zoom image 
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)  # randomly flip images


  datagen.fit(X_train)

  batch_size = 16
  history = model.fit_generator(datagen.flow(X_train,Y_train.values, 
                                             batch_size=16),
                                              epochs = 100,
                                              validation_data = (X_val,Y_val.values),
                                              verbose = 1, 
                                              steps_per_epoch=X_train.shape[0] // batch_size  + 1,
                                              validation_steps = X_val.shape[0] // batch_size +1,
                                              callbacks = [check_point]
                                              )
                  
  
  model.load_weights(check_path)
  
  return model

if __name__ == '__main__':
    model = cnn_model()
    #model.save("mymodel.h5")
114/116 [============================>.] - ETA: 0s - loss: 0.2794 - acc: 0.9045
Epoch 00095: val_acc did not improve from 0.90244
116/116 [==============================] - 2s 15ms/step - loss: 0.2776 - acc: 0.9056 - val_loss: 0.3073 - val_acc: 0.8976
Epoch 96/100
113/116 [============================>.] - ETA: 0s - loss: 0.2397 - acc: 0.9181
Epoch 00096: val_acc did not improve from 0.90244
116/116 [==============================] - 2s 15ms/step - loss: 0.2440 - acc: 0.9164 - val_loss: 0.3861 - val_acc: 0.8683
Epoch 97/100
113/116 [============================>.] - ETA: 0s - loss: 0.2676 - acc: 0.9042
Epoch 00097: val_acc did not improve from 0.90244
116/116 [==============================] - 2s 15ms/step - loss: 0.2677 - acc: 0.9040 - val_loss: 0.2738 - val_acc: 0.8976
Epoch 98/100
114/116 [============================>.] - ETA: 0s - loss: 0.2470 - acc: 0.9122
Epoch 00098: val_acc did not improve from 0.90244
116/116 [==============================] - 2s 15ms/step - loss: 0.2465 - acc: 0.9121 - val_loss: 0.3565 - val_acc: 0.8780
Epoch 99/100
116/116 [==============================] - ETA: 0s - loss: 0.2602 - acc: 0.9126
Epoch 00099: val_acc did not improve from 0.90244
116/116 [==============================] - 2s 15ms/step - loss: 0.2602 - acc: 0.9126 - val_loss: 0.4582 - val_acc: 0.8634
Epoch 100/100
113/116 [============================>.] - ETA: 0s - loss: 0.2358 - acc: 0.9187
Epoch 00100: val_acc did not improve from 0.90244
116/116 [==============================] - 2s 15ms/step - loss: 0.2408 - acc: 0.9164 - val_loss: 0.5688 - val_acc: 0.8341
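
A side note on the training call above: Model.fit_generator is deprecated (and removed in newer TensorFlow releases) in favor of Model.fit, which accepts generators directly. An equivalent call would look like this sketch (same settings assumed):

history = model.fit(datagen.flow(X_train, Y_train.values, batch_size=batch_size),
                    epochs=100,
                    validation_data=(X_val, Y_val.values),
                    steps_per_epoch=X_train.shape[0] // batch_size + 1,
                    verbose=1,
                    callbacks=[check_point])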

Final Submission Code

Building the Train and Validation Datasets

# load the data
train = pd.read_csv('train.csv', index_col = 0)  # use the id column as the index
test = pd.read_csv('test.csv', index_col = 0)
sub = pd.read_csv('submission.csv', index_col = 0)

# basic preprocessing
x = np.array(train.iloc[:,2:]).reshape(-1,28,28,1).astype(float)  # reshape to (batch, height, width, channel)
y = pd.get_dummies(train['digit']).values  # one-hot encode the target variable

# split into train and validation sets, preserving class ratios
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size = 0.1, random_state=1,
                                                      stratify=y)

# scale pixel values to [0, 1]
x_train = x_train / 255.0
x_valid = x_valid / 255.0
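
stratify=y keeps the ten class ratios nearly identical across the two splits. A quick sanity check (my addition; y here is the one-hot array, so a column-wise sum gives class counts):

print(y_train.sum(axis=0) / len(y_train))  # per-class fraction in the training split
print(y_valid.sum(axis=0) / len(y_valid))  # per-class fraction in the validation split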

Building and Training the Model

# model 
model = Sequential([
                      Conv2D(256, (3,3), activation = 'relu', input_shape = (28,28,1)), 
                      MaxPooling2D(),

                      Conv2D(256, (3,3), activation = 'relu'), 
                      #MaxPooling2D(),
                    
                      Conv2D(128, (3,3), activation = 'relu'), 
 
                      Conv2D(128, (3,3), activation = 'relu'), 
                      #MaxPooling2D(),

                      Flatten(),
                      Dropout(0.5),
                      Dense(512, activation='relu'),
                      Dense(10, activation = 'softmax')                    
])

# model compile
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['acc'])

# model checkpoint
checkpath = 'ck.ckpt'
checkpoint = ModelCheckpoint(filepath = checkpath,
                             monitor = 'val_loss',
                             verbose =1,
                             save_best_only = True,
                             save_weights_only = True)



# model fit
history = model.fit(x_train, y_train,
                    validation_data = (x_valid, y_valid),
                    batch_size = 32,
                    epochs = 40,
                    callbacks = [checkpoint])

# model load weights
model.load_weights(checkpath)
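
EarlyStopping and ReduceLROnPlateau are imported at the top but never used. If training ran longer than 40 epochs, they could be added next to the checkpoint; a sketch with illustrative hyperparameters:

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

# then pass all three: callbacks=[checkpoint, early_stop, reduce_lr] in model.fit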

Visualizing the Training Results

Loss / Val Loss Visualization

# plot training loss vs. validation loss
plt.figure(figsize=(12, 9))
plt.plot(np.arange(1, 41), history.history['loss'])
plt.plot(np.arange(1, 41), history.history['val_loss'])
plt.title('Loss / Val Loss', fontsize=20)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['loss', 'val_loss'], fontsize=15)
plt.show()

png


Accuracy / Val Accuracy Visualization

plt.figure(figsize=(12, 9))
plt.plot(np.arange(1, 41), history.history['acc'])
plt.plot(np.arange(1, 41), history.history['val_acc'])
plt.title('acc / val acc', fontsize=20)
plt.xlabel('Epochs')
plt.ylabel('acc')
plt.legend(['acc', 'val_acc'], fontsize=15)
plt.show()

png


Test Dataset Preprocessing

x_test = test.drop(['letter'], axis=1).values  # keep only the pixel columns
x_test = x_test.reshape(-1, 28, 28, 1)
x_test = x_test / 255.0  # same scaling as the training data

Test Dataset Prediction and Submission

sub['digit'] = np.argmax(model.predict(x_test), axis=1)
sub.to_csv('submission.csv')
sub.head()
      digit
id
2049      6
2050      8
2051      2
2052      0
2053      3
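
As a quick sanity check on the 20,480 predictions, the predicted class distribution can be compared with the roughly uniform training distribution (sketch):

print(sub['digit'].value_counts().sort_index())  # count of each predicted digit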

Visualizing the Predictions

fig, axes = plt.subplots(2, 5)
fig.set_size_inches(20, 8)

for index in range(10):
    img = np.array(x_test[index*3]).reshape(28, 28).astype(float)  # np.float is removed in recent NumPy
    axes[index//5, index%5].imshow(img)
    axes[index//5, index%5].set_title('Letter : {}\nDigit : {}'.format(test['letter'].values[index*3], sub['digit'].values[index*3]))
    axes[index//5, index%5].axis('off')

png

  • The letter is given in the test data; the model predicts the digit hidden inside it.