Continuing from the previous assignment, let's keep going XD~
- Split the dataset. If your machine isn't that powerful, please scale down the dataset sizes at the bottom of this block.
import pickle
import numpy as np

# image_size, train_datasets and test_datasets were all created in the
# previous post (one pickle file of images per letter class).

def make_arrays(nb_rows, img_size):
    # Pre-allocate arrays for the images and their labels.
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    # Take an equal number of samples from every class.
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class
                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels

train_size = 2000
valid_size = 1000
test_size = 1000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
    train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)
- Next, shuffle the dataset.
- Then we'll analyze the image data with logistic regression.
np.random.seed(133)

def randomize(dataset, labels):
    # Draw one permutation and apply it to both the images and the
    # labels, so every image keeps its correct label after shuffling.
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation, :, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
# the validation set can be shuffled the same way if needed
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)
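To double-check that the shuffle kept images and labels paired up, it helps to eyeball a few samples. A minimal sketch, assuming matplotlib is installed and that labels 0-9 map to letters 'a'-'j' as in notMNIST:

import matplotlib.pyplot as plt

# Display the first three shuffled training images with their labels.
for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.imshow(train_dataset[i], cmap='gray')
    plt.title(chr(ord('a') + train_labels[i]))
    plt.axis('off')
plt.show()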
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
# Flatten each image_size x image_size image into a 1-D vector, since
# LogisticRegression expects 2-D input of shape (n_samples, n_features).
model.fit(train_dataset.reshape(train_dataset.shape[0], -1), train_labels)
test_pred = model.predict(test_dataset.reshape(test_dataset.shape[0], -1))

# Fraction of correct guesses on the test set:
sum(test_labels == test_pred) / float(len(test_labels))
# 0.709 with train_size = 2000
# 0.869 after rerunning everything with train_size = 20000
After the images are read in as matrices, they have to be flattened with reshape before they can go into the logistic regression model. Prediction works the same way: reshape the test data, feed it to the model, and the results come out. Evaluation is simple: just take the fraction of predictions that match the actual labels. When the sample size was increased tenfold (2000 -> 20000), accuracy improved by 22.6% in relative terms (0.709 -> 0.869).
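For concreteness, here is what the flattening does to the array shapes (a sketch assuming image_size = 28, as in notMNIST), plus sklearn's built-in model.score, which computes the same accuracy ratio in one call:

X_train = train_dataset.reshape(train_dataset.shape[0], -1)
print(train_dataset.shape)  # (20000, 28, 28): a stack of 2-D image matrices
print(X_train.shape)        # (20000, 784):    one flat feature row per image

# Equivalent accuracy check using sklearn directly:
X_test = test_dataset.reshape(test_dataset.shape[0], -1)
print(model.score(X_test, test_labels))  # same ratio as the manual check above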
Next time we'll start using TensorFlow.