Max Coding blog

基於深度學習進行貓與狗的辨認(Pytorch)

2021/08/29

這次會介紹我在Kaggle上進行的貓狗辨認程式,我最終的測試結果的準確率可以高達90%,此部份會使用Pytorch這個深度學習框架。

1. 準備資料

在進行訓練前需要先準備好資料,我們先從Kaggle上下載圖片下來,再做資料的分配,這邊使用貓2000張及狗2000張當作訓練資料,使用貓500張及狗500張當作驗證資料,最後使用貓500張及狗500張當作測試資料,我不使用Kaggle給的所有照片,因為我的電腦要跑很久,所以這邊取比較少張照片做訓練。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import shutil

# Root folder holding the unsplit Kaggle images (in train_all/).
train_all_path = "/home/chisc/workspace/wuzhenrong"

train_dir = "/home/chisc/workspace/wuzhenrong/train"
validation_dir = "/home/chisc/workspace/wuzhenrong/validation"
test_dir = "/home/chisc/workspace/wuzhenrong/test"

train_cat = "/home/chisc/workspace/wuzhenrong/train/cat"
train_dog = "/home/chisc/workspace/wuzhenrong/train/dog"

val_cat = "/home/chisc/workspace/wuzhenrong/validation/cat"
val_dog = "/home/chisc/workspace/wuzhenrong/validation/dog"

test_cat = "/home/chisc/workspace/wuzhenrong/test/cat"
test_dog = "/home/chisc/workspace/wuzhenrong/test/dog"

# Create every split directory. makedirs(exist_ok=True) replaces the
# nine separate "if not os.path.exists(...): os.mkdir(...)" guards and
# also creates missing parent directories.
for _d in (train_dir, validation_dir, test_dir,
           train_cat, train_dog,
           val_cat, val_dog,
           test_cat, test_dog):
    os.makedirs(_d, exist_ok=True)

_SRC = "/home/chisc/workspace/wuzhenrong/train_all"


def _copy_range(species, start, stop, dest_dir):
    """Copy `<species>.<i>.jpg` for i in [start, stop) from train_all into dest_dir."""
    for i in range(start, stop):
        name = f"{species}.{i}.jpg"
        shutil.copyfile(os.path.join(_SRC, name), os.path.join(dest_dir, name))


# Split: images 0-1999 -> train, 2000-2499 -> validation, 2500-2999 -> test.
_copy_range("cat", 0, 2000, train_cat)
_copy_range("dog", 0, 2000, train_dog)

_copy_range("cat", 2000, 2500, val_cat)
_copy_range("dog", 2000, 2500, val_dog)

_copy_range("cat", 2500, 3000, test_cat)
# BUG FIX: the original copied the test dog images into test/cat/,
# putting dogs under the cat label for the test set.
_copy_range("dog", 2500, 3000, test_dog)
首先會先判斷目錄是否存在,如果不存在會自動建立一個,再來就是分配資料,這邊使用shutil.copyfile()來複製資料,第一個參數是資料來源地,第二個參數是目的地。

2. 引入函式庫

1
2
3
4
5
6
7
8
9
10
11
12
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from PIL import Image
from torchvision.datasets import DatasetFolder
import torchvision
from tqdm.notebook import tqdm as tqdm

為了畫函數圖形,所以引入matplotlib,接著numpy和pandas是常用的工具,所以提前引入以防不備之需,torch.nn裡包含很多神經網路的類別。再來是torchvision,引入後可以做資料的提取及準備。由於pytorch沒有訓練進度條,所以引入tqdm可以顯示進度條。

3. 引入資料

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Training-time transforms: light augmentation (flip + rotation), resize to
# the network's 224x224 input, then normalize with ImageNet statistics.
train_trans = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation((-30, 30)),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
# Validation/test transforms: no augmentation, but the same resize and
# normalization as training so inputs come from the same distribution.
# (The original also defined a `val_trans` without Normalize, but it was
# never used -- val_data below uses test_trans -- so it is removed.)
test_trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

batch_size = 32

# ImageFolder assigns a label per sub-folder name (cat -> 0, dog -> 1).
train_data = ImageFolder(train_path, transform=train_trans)
val_data = ImageFolder(val_path, transform=test_trans)
test_data = ImageFolder(test_path, transform=test_trans)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                          num_workers=2, pin_memory=True)
# NOTE(review): shuffling validation data is harmless but not required.
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True,
                        num_workers=2, pin_memory=True)
test_loader = DataLoader(test_data, shuffle=True)  # batch_size defaults to 1

接著就是讀入資料,transforms.Compose可以放入data augmentation的資訊,ImageFolder是從目錄裡讀取資料,會依據不同資料夾來當作不同label,而DataLoader會彙整剛剛兩個的資訊。

4. 看圖片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
images, labels = next(iter(train_loader))

# Show the first three training images exactly as the network sees them
# (i.e. after Normalize), each in its own figure.
for i in range(3):
    # BUG FIX: the original reused figure ids 0-2 in both loops, so these
    # figures were overwritten by the denormalized ones before plt.show().
    plt.figure(f"normalized-{i}")
    plt.imshow(images[i].permute(1, 2, 0))  # CHW -> HWC for matplotlib

# Show the same images denormalized, i.e. with roughly original colors.
# Inverting Normalize(mean, std) is Normalize(-mean/std, 1/std).
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
for i in range(3):
    plt.figure(f"original-{i}")
    tmp = transforms.Normalize(-mean / std, 1 / std)(images[i])  # denormalize
    plt.imshow(tmp.permute(1, 2, 0))
plt.show()

這邊的程式可以看到貓與狗的圖片,由於我們的train data有做normalize,所以要做denormalize,才能看到原圖。

5. CNN架構

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
class CatDpg(nn.Module):
    """VGG-style CNN classifying 224x224 RGB images into 2 classes (cat/dog).

    Five conv blocks (3x3 same-padding conv -> ReLU -> 2x2 max-pool) halve
    the spatial size each time: 224 -> 112 -> 56 -> 28 -> 14 -> 7, ending
    with a (512, 7, 7) feature map fed to a fully-connected classifier.
    """

    def __init__(self):
        super(CatDpg, self).__init__()
        # Channel progression per block; padding=1 keeps the spatial size
        # unchanged for a 3x3 kernel (padding = kernel_size // 2).
        widths = [3, 64, 128, 256, 512, 512]
        blocks = []
        for c_in, c_out in zip(widths, widths[1:]):
            blocks.append(nn.Conv2d(in_channels=c_in, out_channels=c_out,
                                    kernel_size=3, stride=1, padding=1))
            blocks.append(nn.ReLU())
            blocks.append(nn.MaxPool2d(kernel_size=2))
        self.cnn = nn.Sequential(*blocks)

        # Classifier head: two dropout-regularized hidden layers, 2 logits out.
        self.fc = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024),
            nn.Dropout(0.4),  # regularization against overfitting
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(1024, 2),
        )

    def forward(self, x):
        """Forward pass: conv features, flatten per sample, classify."""
        features = self.cnn(x)
        return self.fc(features.flatten(1))

CNN的架構如下: 1. Input layer 2. Convolutional layer 3. ReLU layer 4. Pooling layer 5. Fully-connected layer 我們首先先建立卷積層,再一層激勵函數,然後再來一個池化層,記住padding等於kernel_size / 2,這樣做5層即可,然後在forward裡需要加入flatten(),這樣才能做fully-connected。

6. 開始訓練

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Train on the GPU when available, otherwise fall back to the CPU.
device = "cuda" if train_on_gpu else "cpu"
model = CatDpg()

model = model.to(device)
print(model)

# Adam optimizer + cross-entropy loss for the 2-class problem.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_func = nn.CrossEntropyLoss()

n_epochs = 30
train_loss_record = []
train_acc_record = []
val_loss_record = []
val_acc_record = []

for epoch in range(n_epochs):
    train_loss = 0.0
    val_loss = 0.0
    train_acc = 0.0
    val_acc = 0.0

    model.train()  # enable dropout for the training phase
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.to(device)
        prediction = model(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc = (prediction.argmax(dim=1) == y).float().mean()
        # .item() detaches from the autograd graph; the original
        # accumulated live tensors, keeping every batch's graph alive.
        train_acc += acc.item() / len(train_loader)
        train_loss += loss.item() / len(train_loader)

    print(f"[ Train | {epoch+1}/{n_epochs} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
    train_loss_record.append(train_loss)
    train_acc_record.append(train_acc)

    # BUG FIX: the original validated in train mode (dropout active) and
    # even called loss.backward() on validation batches. Validation must
    # run in eval mode with gradient tracking disabled.
    model.eval()
    with torch.no_grad():
        for x, y in tqdm(val_loader):
            x, y = x.to(device), y.to(device)
            prediction = model(x)
            loss = loss_func(prediction, y)
            acc = (prediction.argmax(dim=1) == y).float().mean()
            val_acc += acc.item() / len(val_loader)
            val_loss += loss.item() / len(val_loader)
    print(f"[ Validation | {epoch+1}/{n_epochs} ] loss = {val_loss:.5f}, acc = {val_acc:.5f}")
    val_loss_record.append(val_loss)
    val_acc_record.append(val_acc)

# Persist the entire model object (architecture + weights).
torch.save(model, 'catvsdog.pkl')

首先要判斷是否有CUDA,如果有就使用CUDA訓練,如果沒有,就用CPU訓練,我們這邊使用Adam當作optimizer,Adam相對SGD還要來的穩定,且沒有梯度消失及梯度爆炸的問題,loss function是使用cross entropy,接著進入訓練,記得訓練的地方需要加入model.train()。

7. 查看模型效能

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Loss curves: training vs. validation per epoch.
plt.figure(1)
plt.title('Training and Validation Loss')
train_l, = plt.plot(train_loss_record, color='red')
val_l, = plt.plot(val_loss_record, color='blue')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(handles=[train_l, val_l], labels=['Training', 'Validation'], loc='best')
plt.show()

# Accuracy curves: training vs. validation per epoch.
plt.figure(2)
plt.title('Training and Validation Accuracy')
train_a, = plt.plot(train_acc_record, color='red')
val_a, = plt.plot(val_acc_record, color='blue')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(handles=[train_a, val_a], labels=['Training', 'Validation'], loc='best')
plt.show()

# Show the prediction for every 10th test image (test_loader batch size is 1).
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
model.eval()  # BUG FIX: disable dropout for inference
i = 0
with torch.no_grad():  # no gradients needed at test time
    for x, y in test_loader:
        i += 1
        if train_on_gpu:
            x, y = x.cuda(), y.cuda()
        out = model(x).argmax(dim=1).to('cpu').numpy()
        if i % 10 == 0:
            plt.figure(i)
            # Class index 0 is cat, 1 is dog (ImageFolder's alphabetical order).
            plt.title('Predict: cat' if out[0] == 0 else 'Predict: dog')
            # Drop the batch dimension, denormalize, move to CPU, and show.
            tmp = transforms.Normalize(-mean / std, 1 / std)(x.squeeze())
            plt.imshow(tmp.to('cpu').permute(1, 2, 0))
            plt.show()

最後把圖形輸出就完成了,而測試的最高準確率可以達到91%,而平均測試準確率是88%。

總程式碼

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from PIL import Image
from torchvision.datasets import DatasetFolder
import torchvision
from tqdm.notebook import tqdm as tqdm

# Detect whether training can run on a CUDA device and report it.
train_on_gpu = torch.cuda.is_available()
print('CUDA is available!' if train_on_gpu else 'CUDA is not available.')

# Directories produced by the data-preparation step; each contains one
# sub-folder per class (cat/, dog/) as expected by ImageFolder.
train_path = '/home/chisc/workspace/wuzhenrong/train'
val_path = '/home/chisc/workspace/wuzhenrong/validation/'
test_path = '/home/chisc/workspace/wuzhenrong/test/'

# Training-time transforms: light augmentation (flip + rotation), resize to
# the network's 224x224 input, then normalize with ImageNet statistics.
train_trans = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation((-30, 30)),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
# Validation/test transforms: no augmentation, but the same resize and
# normalization as training so inputs come from the same distribution.
# (The original also defined a `val_trans` without Normalize, but it was
# never used -- val_data below uses test_trans -- so it is removed.)
test_trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

batch_size = 32

# ImageFolder assigns a label per sub-folder name (cat -> 0, dog -> 1).
train_data = ImageFolder(train_path, transform=train_trans)
val_data = ImageFolder(val_path, transform=test_trans)
test_data = ImageFolder(test_path, transform=test_trans)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                          num_workers=2, pin_memory=True)
# NOTE(review): shuffling validation data is harmless but not required.
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True,
                        num_workers=2, pin_memory=True)
test_loader = DataLoader(test_data, shuffle=True)  # batch_size defaults to 1
print(train_loader)

images, labels = next(iter(train_loader))

# Show the first three training images exactly as the network sees them
# (i.e. after Normalize), each in its own figure.
for i in range(3):
    # BUG FIX: the original reused figure ids 0-2 in both loops, so these
    # figures were overwritten by the denormalized ones before plt.show().
    plt.figure(f"normalized-{i}")
    plt.imshow(images[i].permute(1, 2, 0))  # CHW -> HWC for matplotlib

# Show the same images denormalized, i.e. with roughly original colors.
# Inverting Normalize(mean, std) is Normalize(-mean/std, 1/std).
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
for i in range(3):
    plt.figure(f"original-{i}")
    tmp = transforms.Normalize(-mean / std, 1 / std)(images[i])  # denormalize
    plt.imshow(tmp.permute(1, 2, 0))
plt.show()

# 1. Input layer
# 2. Convolutional layer
# 3. ReLU layer
# 4. Pooling layer
# 5. Fully-connected layer
# CNN structure: input -> convolutional -> ReLU -> pooling -> fully-connected.
class CatDpg(nn.Module):
    """VGG-style CNN classifying 224x224 RGB images into 2 classes (cat/dog).

    Five conv blocks (3x3 same-padding conv -> ReLU -> 2x2 max-pool) halve
    the spatial size each time: 224 -> 112 -> 56 -> 28 -> 14 -> 7, ending
    with a (512, 7, 7) feature map fed to a fully-connected classifier.
    """

    def __init__(self):
        super(CatDpg, self).__init__()
        # Channel progression per block; padding=1 keeps the spatial size
        # unchanged for a 3x3 kernel (padding = kernel_size // 2).
        widths = [3, 64, 128, 256, 512, 512]
        blocks = []
        for c_in, c_out in zip(widths, widths[1:]):
            blocks.append(nn.Conv2d(in_channels=c_in, out_channels=c_out,
                                    kernel_size=3, stride=1, padding=1))
            blocks.append(nn.ReLU())
            blocks.append(nn.MaxPool2d(kernel_size=2))
        self.cnn = nn.Sequential(*blocks)

        # Classifier head: two dropout-regularized hidden layers, 2 logits out.
        self.fc = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024),
            nn.Dropout(0.4),  # regularization against overfitting
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(1024, 2),
        )

    def forward(self, x):
        """Forward pass: conv features, flatten per sample, classify."""
        features = self.cnn(x)
        return self.fc(features.flatten(1))

# Train on the GPU when available, otherwise fall back to the CPU.
device = "cuda" if train_on_gpu else "cpu"
model = CatDpg()

model = model.to(device)
print(model)

# Adam optimizer + cross-entropy loss for the 2-class problem.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_func = nn.CrossEntropyLoss()

n_epochs = 30
train_loss_record = []
train_acc_record = []
val_loss_record = []
val_acc_record = []

for epoch in range(n_epochs):
    train_loss = 0.0
    val_loss = 0.0
    train_acc = 0.0
    val_acc = 0.0

    model.train()  # enable dropout for the training phase
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.to(device)
        prediction = model(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc = (prediction.argmax(dim=1) == y).float().mean()
        # .item() detaches from the autograd graph; the original
        # accumulated live tensors, keeping every batch's graph alive.
        train_acc += acc.item() / len(train_loader)
        train_loss += loss.item() / len(train_loader)

    print(f"[ Train | {epoch+1}/{n_epochs} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
    train_loss_record.append(train_loss)
    train_acc_record.append(train_acc)

    # BUG FIX: the original validated in train mode (dropout active) and
    # even called loss.backward() on validation batches. Validation must
    # run in eval mode with gradient tracking disabled.
    model.eval()
    with torch.no_grad():
        for x, y in tqdm(val_loader):
            x, y = x.to(device), y.to(device)
            prediction = model(x)
            loss = loss_func(prediction, y)
            acc = (prediction.argmax(dim=1) == y).float().mean()
            val_acc += acc.item() / len(val_loader)
            val_loss += loss.item() / len(val_loader)
    print(f"[ Validation | {epoch+1}/{n_epochs} ] loss = {val_loss:.5f}, acc = {val_acc:.5f}")
    val_loss_record.append(val_loss)
    val_acc_record.append(val_acc)

# Persist the entire model object (architecture + weights).
torch.save(model, 'catvsdog.pkl')

# Loss curves: training vs. validation per epoch.
plt.figure(1)
plt.title('Training and Validation Loss')
train_l, = plt.plot(train_loss_record, color='red')
val_l, = plt.plot(val_loss_record, color='blue')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(handles=[train_l, val_l], labels=['Training', 'Validation'], loc='best')
plt.show()

# Accuracy curves: training vs. validation per epoch.
plt.figure(2)
plt.title('Training and Validation Accuracy')
train_a, = plt.plot(train_acc_record, color='red')
val_a, = plt.plot(val_acc_record, color='blue')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(handles=[train_a, val_a], labels=['Training', 'Validation'], loc='best')
plt.show()

# Show the prediction for every 10th test image (test_loader batch size is 1).
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
model.eval()  # BUG FIX: disable dropout for inference
i = 0
with torch.no_grad():  # no gradients needed at test time
    for x, y in test_loader:
        i += 1
        if train_on_gpu:
            x, y = x.cuda(), y.cuda()
        out = model(x).argmax(dim=1).to('cpu').numpy()
        if i % 10 == 0:
            plt.figure(i)
            # Class index 0 is cat, 1 is dog (ImageFolder's alphabetical order).
            plt.title('Predict: cat' if out[0] == 0 else 'Predict: dog')
            # Drop the batch dimension, denormalize, move to CPU, and show.
            tmp = transforms.Normalize(-mean / std, 1 / std)(x.squeeze())
            plt.imshow(tmp.to('cpu').permute(1, 2, 0))
            plt.show()
by 中和高中 吳振榮
CATALOG
  1. 1. 1. 準備資料
  2. 2. 2. 引入函式庫
  3. 3. 3. 引入資料
  4. 4. 4. 看圖片
  5. 5. 5. CNN架構
  6. 6. 6. 開始訓練
  7. 7. 7. 查看模型效能
  8. 8. 總程式碼
    1. 8.0.0.1. by 中和高中 吳振榮