- An app that collects a handful of examples of each gesture (draw a few checkmarks, a few hearts, and so on).
- Some Python scripts that train a machine learning algorithm (explained below) to recognize the gestures. We'll use TensorFlow, which we'll get to later.
- An app that puts the custom gestures to use. It records what the user draws on the screen and uses the machine learning algorithm to figure out which gesture it represents.
In computer science, gesture recognition is the topic of recognizing human gestures by means of mathematical algorithms. Letting users control or interact with a device through simple gestures gives the computer a way to understand human actions.
With just a couple of strokes on the screen, the phone recognizes complex gestures in real time.
The gestures we draw will be used to train a machine learning algorithm, and we'll use Core ML to evaluate that algorithm in-app.
First, let's make sure our machine learning algorithm has some data (gestures) to learn from. To generate a realistic data set, I wrote an iOS app called "GestureInput" for entering gestures on the device. It lets you enter a large number of strokes, preview the resulting image, and add it to the data set. You can also change the associated class (known as a label) and delete examples.
Generating data for the machine learning algorithm
Outputting the training data
The "Rasterize" button in GestureInput converts the user's drawings into images and saves them to a file named data.trainingset. These images are what we'll feed to the algorithm.
Currently, the state of the art in machine learning for image classification is the convolutional neural network (CNN). We'll train a CNN with TensorFlow and use it in our app.
My neural network is based on the one used in the TensorFlow tutorial "Deep MNIST for Experts": https://www.tensorflow.org/get_started/mnist/pros
The set of scripts I use to train and export the model lives in a folder called "gesturelearner": https://github.com/mitochrome/complex-gestures-demo/tree/master/gesturelearner.
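The Core ML export code later in this post gives away the rough shape of that network: the input image is shifted by -0.5, run through 3×3 convolution + ReLU + 2×2 max-pool blocks (32 channels in the first, 64 by the last), flattened to 6·6·64 values, passed through a 1024-unit fully connected layer, and finished with a second fully connected layer and a softmax. As an illustrative sketch only (this is not the repository's train.py; the 48×48 input size, the intermediate channel counts, and the label count are assumptions inferred from that structure), such a graph could be defined with the TensorFlow 1.x API like this:

import tensorflow as tf

# Illustrative constants; the real values live in the gesturelearner code.
IMAGE_HEIGHT, IMAGE_WIDTH, NUM_LABEL_INDEXES = 48, 48, 13

def weight(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias(shape):
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv_pool(h, in_channels, out_channels):
    # 3x3 convolution, ReLU, then 2x2 max-pooling, mirroring the layers that the
    # Core ML export code below recreates.
    W, b = weight([3, 3, in_channels, out_channels]), bias([out_channels])
    h = tf.nn.relu(tf.nn.conv2d(h, W, strides=[1, 1, 1, 1], padding='SAME') + b)
    return tf.nn.max_pool(h, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

x = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT, IMAGE_WIDTH, 1])
h = conv_pool(x - 0.5, 1, 32)   # center pixel values, then conv2d_1 / maxpool_1
h = conv_pool(h, 32, 64)        # conv2d_2 / maxpool_2 (channel count assumed)
h = conv_pool(h, 64, 64)        # conv2d_3 / maxpool_3

h = tf.reshape(h, [-1, 6 * 6 * 64])            # 48 -> 24 -> 12 -> 6 after three pools
W_fc1, b_fc1 = weight([6 * 6 * 64, 1024]), bias([1024])
h = tf.nn.relu(tf.matmul(h, W_fc1) + b_fc1)    # fc1

W_fc2, b_fc2 = weight([1024, NUM_LABEL_INDEXES]), bias([NUM_LABEL_INDEXES])
logits = tf.matmul(h, W_fc2) + b_fc2           # fc2
label_values = tf.nn.softmax(logits)           # what the app later reads as labelValues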
I'll go through the typical usage below, but the scripts also have some extra command-line options that may be useful. Start by setting things up with virtualenv:
cd /path/to/gesturelearner
# Until coremltools supports Python 3, use Python 2.7.
virtualenv -p $(which python2.7) venv
source venv/bin/activate
pip install -r requirements.txt
Preparing the data set
First, I use filter.py to split the data set into a 15% "test set" and an 85% "training set".
# Activate the virtualenv.
source /path/to/gesturelearner/venv/bin/activate
# Split the data set.
python /path/to/gesturelearner/filter.py --test-fraction=0.15 data.trainingset
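filter.py owns the actual file handling; conceptually, the split is just a shuffled partition of the examples. As a rough sketch only (not the real filter.py, which also deals with the data.trainingset on-disk format), a 15%/85% split over a list of labeled examples looks like this:

import random

def split_examples(examples, test_fraction=0.15):
    # Shuffle, then carve off the first `test_fraction` of examples as the test set.
    shuffled = list(examples)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * test_fraction)
    return shuffled[cut:], shuffled[:cut]  # (training set, test set)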
Training
# Convert the generated files to the TensorFlow TFRecords format.
python /path/to/gesturelearner/convert_to_tfrecords.py data_filtered.trainingset
python /path/to/gesturelearner/convert_to_tfrecords.py data_filtered_test.trainingset
# Train the neural network.
python /path/to/gesturelearner/train.py --test-file=data_filtered_test.tfrecords data_filtered.tfrecords
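The command above is all you need to run. Purely as an illustration of what a Deep-MNIST-style training step looks like, here is a minimal sketch that reuses x, logits, and NUM_LABEL_INDEXES from the network sketch earlier; it is not the actual train.py and ignores the TFRecords input pipeline and evaluation that the real script handles:

# Sketch only: `training_batches` stands in for an input pipeline over the .tfrecords files.
y_ = tf.placeholder(tf.float32, [None, NUM_LABEL_INDEXES])
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for batch_images, batch_labels in training_batches:
        sess.run(train_step, feed_dict={x: batch_images, y_: batch_labels})
    # The checkpoint is what gets converted to a .mlmodel in the next step.
    saver.save(sess, 'model.ckpt')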
Exporting to Core ML
To use the trained network with Core ML, we rebuild it layer by layer with coremltools' NeuralNetworkBuilder, copying in the weights from TensorFlow:
from coremltools.models import MLModel
from coremltools.models.neural_network import NeuralNetworkBuilder
import coremltools.models.datatypes as datatypes

# ...

def make_mlmodel(variables):
    # Specify the inputs and outputs (there can be multiple).
    # Each name corresponds to the input_name/output_name of a layer in the network so
    # that Core ML knows where to insert and extract data.
    input_features = [('image', datatypes.Array(1, IMAGE_HEIGHT, IMAGE_WIDTH))]
    output_features = [('labelValues', datatypes.Array(NUM_LABEL_INDEXES))]
    builder = NeuralNetworkBuilder(input_features, output_features, mode=None)

    # The "name" parameter has no effect on the function of the network. As far as I know
    # it's only used when Xcode fails to load your mlmodel and gives you an error telling
    # you what the problem is.
    # The input_names and output_name are used to link layers to each other and to the
    # inputs and outputs of the model. When adding or removing layers, or renaming their
    # outputs, always make sure you correct the input and output names of the layers
    # before and after them.
    builder.add_elementwise(name='add_layer',
                            input_names=['image'], output_name='add_layer', mode='ADD',
                            alpha=-0.5)

    # Although Core ML internally uses weight matrices of shape
    # (outputChannels, inputChannels, height, width) (as can be found by looking at the
    # protobuf specification comments), add_convolution takes the shape
    # (height, width, inputChannels, outputChannels) (as can be found in the coremltools
    # documentation). The latter shape matches what TensorFlow uses so we don't need to
    # reorder the matrix axes ourselves.
    builder.add_convolution(name='conv2d_1', kernel_channels=1,
                            output_channels=32, height=3, width=3, stride_height=1,
                            stride_width=1, border_mode='same', groups=0,
                            W=variables['W_conv1'].eval(), b=variables['b_conv1'].eval(),
                            has_bias=True, is_deconv=False, output_shape=None,
                            input_name='add_layer', output_name='conv2d_1')

    builder.add_activation(name='relu_1', non_linearity='RELU', input_name='conv2d_1',
                           output_name='relu_1', params=None)

    builder.add_pooling(name='maxpool_1', height=2, width=2, stride_height=2,
                        stride_width=2, layer_type='MAX', padding_type='SAME',
                        input_name='relu_1', output_name='maxpool_1')

    # ...

    builder.add_flatten(name='maxpool_3_flat', mode=1, input_name='maxpool_3',
                        output_name='maxpool_3_flat')

    # We must swap the axes of the weight matrix because add_inner_product takes the shape
    # (outputChannels, inputChannels) whereas TensorFlow uses
    # (inputChannels, outputChannels). Unlike with add_convolution (see the comment
    # above), the shape add_inner_product expects matches what the protobuf specification
    # requires for inner products.
    builder.add_inner_product(name='fc1',
                              W=tf_fc_weights_order_to_mlmodel(variables['W_fc1'].eval())
                                  .flatten(),
                              b=variables['b_fc1'].eval().flatten(),
                              input_channels=6*6*64, output_channels=1024, has_bias=True,
                              input_name='maxpool_3_flat', output_name='fc1')

    # ...

    builder.add_softmax(name='softmax', input_name='fc2', output_name='labelValues')

    model = MLModel(builder.spec)

    model.short_description = 'Model for recognizing a variety of images drawn on screen with one\'s finger'

    model.input_description['image'] = 'A gesture image to classify'
    model.output_description['labelValues'] = 'The "probability" of each label, in a dense array'

    return model
# Save a Core ML .mlmodel file from the TensorFlow checkpoint model.ckpt.
python /path/to/gesturelearner/save_mlmodel.py model.ckpt
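Conceptually, this script restores the trained variables from the checkpoint, hands them to the make_mlmodel() function shown above, and writes the resulting .mlmodel to disk. Here is a rough sketch of such a driver; the real save_mlmodel.py may look up variables and name the output file differently:

import sys
import tensorflow as tf

checkpoint_path = sys.argv[1]  # e.g. model.ckpt

with tf.Session() as sess:
    # Restore the graph and weights saved during training.
    saver = tf.train.import_meta_graph(checkpoint_path + '.meta')
    saver.restore(sess, checkpoint_path)

    # Collect the variables by name and rebuild the network as a Core ML model
    # using make_mlmodel() from the listing above.
    variables = {v.op.name: v for v in tf.global_variables()}
    model = make_mlmodel(variables)
    model.save('GestureModel.mlmodel')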
Adding the .mlmodel file to the Xcode project automatically generates three classes for us: GestureModel, GestureModelInput, and GestureModelOutput. To feed the model, we convert the user's drawing into the image format it expects:

/**
 * Convert the `Drawing` into a binary image of format suitable for input to the
 * GestureModel neural network.
 *
 * - returns: If successful, a valid input for GestureModel
 */
func drawingToGestureModelFormat(_ drawing: Drawing) -> MLMultiArray? {
    guard let image = drawing.rasterized(), let grays = imageToGrayscaleValues(image: image) else {
        return nil
    }

    guard let array = try? MLMultiArray(
        shape: [
            1,
            NSNumber(integerLiteral: Int(image.size.width)),
            NSNumber(integerLiteral: Int(image.size.height))
        ],
        dataType: .double
    ) else {
        return nil
    }

    let doubleArray = array.dataPointer.bindMemory(to: Float64.self, capacity: array.count)

    for i in 0 ..< array.count {
        doubleArray.advanced(by: i).pointee = Float64(grays[i]) / 255.0
    }

    return array
}
/**
 * Convert the `Drawing` into a grayscale image and use a neural network to compute
 * values ("probabilities") for each gesture label.
 *
 * - returns: An array that has at each index `i` the value for
 *   `Touches_Label.all[i]`.
 */
func predictLabel(drawing: Drawing) -> [Double]? {
    // Convert the user's gesture ("drawing") into a fixed-size grayscale image.
    guard let array = drawingToGestureModelFormat(drawing) else {
        return nil
    }

    let model = GestureModel.shared

    // The GestureModel convenience method prediction(image:) wraps our image in
    // a GestureModelInput instance before passing that to prediction(input:).
    // Both methods return a GestureModelOutput with our output in the
    // labelValues property. The names "image" and "labelValues" come from the
    // names we gave to the inputs and outputs of the .mlmodel when we saved it.
    guard let labelValues = try? model.prediction(image: array).labelValues else {
        return nil
    }

    // Convert the MLMultiArray labelValues into a normal array.
    let dataPointer = labelValues.dataPointer.bindMemory(to: Double.self, capacity: labelValues.count)
    return Array(UnsafeBufferPointer(start: dataPointer, count: labelValues.count))
}
To cut down on conflicts between gestures, I use two simple rules: