图来源:莱顿大学的数字收藏网站
人脸检测(幸好背景中的人物的面孔没有被检测到)
from imutils.face_utils import FaceAligner
from dlib import rectangle
import face_recognition
import imutils
import dlib
import cv2
import os
def resize(img):
    """Return ``img`` enlarged by a factor of 2.5 along both axes."""
    # A (0, 0) target size tells OpenCV to derive the output size from fx / fy.
    upscale = 2.5
    return cv2.resize(img, (0, 0), fx=upscale, fy=upscale)
# Face extraction: detect every face in each poster, align it to a
# 224-pixel-wide crop and write the result to faces/<poster>_<n>.jpg.
detector = dlib.get_frontal_face_detector()
# Path to the dlib model shape_predictor_68_face_landmarks.dat
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
fa = FaceAligner(predictor, desiredFaceWidth=224)

# cv2.imwrite does not create missing directories (it just returns False),
# so make sure the output folder exists before the loop starts.
os.makedirs('faces', exist_ok=True)

images = os.listdir('Posters')
for position, poster_name in enumerate(images):
    print(poster_name, ':', position, '/', len(images))

    # Each poster is stored as Posters/<name>/<name>.jpg; os.path.join keeps
    # the path valid on every platform instead of hard-coding backslashes.
    image_path = os.path.join('Posters', poster_name, poster_name + '.jpg')
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals a missing or undecodable file by returning None.
        print('Skipping unreadable image:', image_path)
        continue

    img = resize(img)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # OpenCV loads images as BGR, while face_recognition works on RGB input.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    face_locations = face_recognition.face_locations(rgb, number_of_times_to_upsample=0)

    for face_number, face_location in enumerate(face_locations):
        # face_recognition reports boxes as (top, right, bottom, left) ...
        top, right, bottom, left = face_location
        # ... whereas a dlib rectangle is built as (left, top, right, bottom).
        dlib_rect = rectangle(left, top, right, bottom)
        # The aligner gets the BGR image so the saved crop keeps OpenCV's
        # channel order, which is what cv2.imwrite expects.
        faceAligned = fa.align(img, gray, dlib_rect)
        cv2.imwrite('faces/' + poster_name + '_' + str(face_number) + '.jpg', faceAligned)
朝鲜人的面部
import cv2
import os
import matplotlib.pyplot as plt
# List the files in faces/ (the aligned crops written by the face-extraction
# step); each one is shown in turn for manual labelling.
images = os.listdir('faces')
def smaller(img):
    """Return ``img`` scaled down to half of its width and height."""
    half = 1/2
    shrunk = cv2.resize(img, (0, 0), fx=half, fy=half)
    return shrunk
# Manual labelling loop: show each face, ask for its category on stdin and
# file the image under the matching data7/ sub-folder.
plt.axis('off')

# Sub-folder (inside the train/test split) for each gendered answer.
# 'o' (other) is handled separately because it lives outside both splits.
LABEL_DIRS = {'m': 'male', 'f': 'female'}

# cv2.imwrite returns False instead of raising when the target directory is
# missing, which would silently throw away labelling work; create every
# destination up front.
for split_dir in ('data7/test', 'data7/train'):
    for label_dir in LABEL_DIRS.values():
        os.makedirs(os.path.join(split_dir, label_dir), exist_ok=True)
os.makedirs('data7/other', exist_ok=True)

for index, image in enumerate(images):
    img = cv2.imread('faces/' + image)
    if img is None:
        # cv2.imread returns None for files it cannot decode; skip them
        # rather than crashing in the middle of a labelling session.
        print('Skipping unreadable file:', image)
        continue

    # NOTE(review): the half-size image is also what gets written to disk
    # below, not just what is displayed -- confirm this is intended.
    img = smaller(img)
    plt.imshow(cv2.cvtColor((img), cv2.COLOR_BGR2RGB))
    plt.show(block=False)

    # Keep asking until one of the three accepted answers is given.
    category = ''
    while category not in ['m', 'f', 'o']:
        print(index, '/', len(images))
        print(image)
        category = input('Category ? ')

    # Every fourth image (by position in the listing) goes to the test split.
    if index % 4 == 0:
        path = 'data7/test'
    else:
        path = 'data7/train'

    if category == 'o':
        target = 'data7/other/' + image
    else:
        target = path + '/' + LABEL_DIRS[category] + '/' + image

    if not cv2.imwrite(target, img):
        print('Could not write', target)
手动标注数据集
VGG-16模型架构,VGG Face基于此架构
import os
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras_vggface.vggface import VGGFace
from inspect import getsourcefile
from os.path import abspath
from generator import Generator
# Directory containing this script; the data7/ folder is resolved relative to it.
path = os.path.dirname(abspath(getsourcefile(lambda:0)))
# os.listdir returns entries in arbitrary order. Sorting pins each category
# to the same integer label on every run and every filesystem.
categories = sorted(os.listdir(os.path.join(path, 'data7/test')))
y_train = []
X_train = []
y_test = []
X_test = []
batch_size = 10

print('Getting samples')
# Collect file paths (X) and integer class labels (y) for both splits. The
# test split is walked first, then the train split. Within a split the
# samples are grouped by category, in the order given by `categories`.
for split_dir, X_split, y_split in (
    ('data7/test', X_test, y_test),
    ('data7/train', X_train, y_train),
):
    for i, category in enumerate(categories):
        print(category)
        samples = os.listdir(os.path.join(path, split_dir, category))
        for sample in samples:
            X_split.append(os.path.join(path, split_dir, category, sample))
            y_split.append(i)

# shuffle=False keeps the generated batches in the same order as the lists
# above -- presumably so features can be matched to labels by position later;
# Generator is project code not visible here, TODO confirm.
training_generator = Generator(width = 224, height = 224, channels = 3, batch_size = batch_size).generate(X_train, y_train, return_labels = False, shuffle = False)
testing_generator = Generator(width = 224, height = 224, channels = 3, batch_size = batch_size).generate(X_test, y_test, return_labels = False, shuffle = False)
def save_features():
    """Run both splits through VGGFace and save the pooled features to disk.

    Builds the VGGFace network without its classification head (average
    pooling on top), predicts on ``training_generator`` and
    ``testing_generator``, and writes the outputs to
    ``bottleneck_features_train.npy`` and ``bottleneck_features_testing.npy``
    in the current working directory.

    The number of prediction steps is ``len(X) // batch_size``, so a trailing
    partial batch is never requested from the generators.
    """
    model = VGGFace(include_top=False, input_shape=(224, 224, 3), weights='vggface', pooling = 'avg')

    bottleneck_features_train = model.predict_generator(training_generator, len(X_train) // batch_size)
    # Use a context manager so the file is flushed and closed deterministically.
    with open('bottleneck_features_train.npy', 'wb') as train_file:
        np.save(train_file, bottleneck_features_train)

    bottleneck_features_test = model.predict_generator(testing_generator, len(X_test) // batch_size)
    with open('bottleneck_features_testing.npy', 'wb') as test_file:
        np.save(test_file, bottleneck_features_test)
def _labels_for_full_batches(labels, batch_size):
    """Return ``labels`` as an array, cut down to a whole number of batches."""
    usable = len(labels) - (len(labels) % batch_size)
    return np.array(labels[:usable])


def create_model():
    """Train and evaluate a small dense classifier on the saved features.

    Loads the arrays written by ``save_features``, fits a
    256 -> 64 -> 1 (sigmoid) network on them for 100 epochs, and prints the
    loss, accuracy and error rate measured on the test-split features. The
    same test-split features are used as validation data during training.

    Labels come from the module-level ``y_train`` / ``y_test`` lists rather
    than hard-coded class counts, so they follow the data set on disk. They
    are truncated to a multiple of ``batch_size`` because ``save_features``
    only requests ``len(X) // batch_size`` batches.
    """
    # Context managers make sure the feature files are closed after loading.
    with open('bottleneck_features_train.npy', 'rb') as train_file:
        train_data = np.load(train_file)
    train_labels = _labels_for_full_batches(y_train, batch_size)

    with open('bottleneck_features_testing.npy', 'rb') as test_file:
        validation_data = np.load(test_file)
    validation_labels = _labels_for_full_batches(y_test, batch_size)

    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=train_data.shape[1:]))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # Single sigmoid unit: the two categories are encoded as labels 0 and 1.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(train_data, train_labels,
              epochs=100,
              batch_size=batch_size,
              validation_data=(validation_data, validation_labels))

    # score is [loss, accuracy], following the metrics passed to compile().
    score = model.evaluate(validation_data, validation_labels)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    print("Baseline Error: %.2f%%" % (100-score[1]*100))
Epoch 95/100
220/220 [==============================] - 0s - loss: 0.0091 - acc: 0.9955 - val_loss: 1.7121 - val_acc: 0.8714
Epoch 96/100
220/220 [==============================] - 0s - loss: 0.0328 - acc: 0.9909 - val_loss: 0.7962 - val_acc: 0.9143
Epoch 97/100
220/220 [==============================] - 0s - loss: 0.1059 - acc: 0.9727 - val_loss: 0.8883 - val_acc: 0.9000
Epoch 98/100
220/220 [==============================] - 0s - loss: 0.0445 - acc: 0.9864 - val_loss: 0.7507 - val_acc: 0.9143
Epoch 99/100
220/220 [==============================] - 0s - loss: 0.0226 - acc: 0.9864 - val_loss: 0.8338 - val_acc: 0.9143
Epoch 100/100
220/220 [==============================] - 0s - loss: 0.0279 - acc: 0.9909 - val_loss: 0.8331 - val_acc: 0.9429
32/70 [============>.................] - ETA: 0s
Test loss: 0.833102767808
Test accuracy: 0.942857142857
Baseline Error: 5.71%
性别分类
男女面部数量分布
海报中塑造的人物性别分布
在男女人物同时出现的海报中的性别分布
def extract_info(desc, gender, gender_dict):
    """Count the words that directly follow each mention of ``gender`` in ``desc``.

    For every token whose lemma equals ``gender``, ``gender_dict['total']`` is
    incremented. The lemmas of the consecutive tokens right after it are then
    collected for as long as their part-of-speech tag is in ``filters``. If
    any were collected, the counter for the space-joined phrase is
    incremented as well.

    Args:
        desc: Text to analyse; passed to the module-level ``nlp`` callable.
        gender: Lemma to look for.
        gender_dict: Mapping from phrase to count, updated in place. Missing
            keys are treated as 0, so a plain dict works as well as a
            defaultdict or Counter.

    ``nlp`` and ``filters`` are defined outside this block -- presumably a
    spaCy pipeline and a collection of POS tags; TODO confirm.
    """
    doc = nlp(desc)
    mentions = [i for i, token in enumerate(doc) if token.lemma_ == gender]

    for mention in mentions:
        gender_dict['total'] = gender_dict.get('total', 0) + 1

        phrase = ''
        cursor = mention + 1
        # Walk forward until the text ends or a token's POS tag is not accepted.
        while cursor < len(doc) and doc[cursor].pos_ in filters:
            phrase += doc[cursor].lemma_ + ' '
            print('s', phrase)
            cursor += 1

        if phrase != '':
            key = phrase.strip()
            gender_dict[key] = gender_dict.get(key, 0) + 1
一名男工人在打扫他的机器,而一名女工人正在擦窗户
然而,借助数据和统计,研究者在探讨表征(representation)问题时能够做出更可靠的论断。从大量数据中提取信息不仅可以加强论证,还有助于在开展进一步研究之前快速检验假设。借助简单的数据处理技术和数据可视化,研究者可以快速发现值得关注之处,并排除那些似乎得不到数据支持的假设。