Background: a colleague of mine once mentioned that he was asked a number of questions about optimization algorithms while interviewing for deep-learning positions, so I decided to write a short introduction to those algorithms in this post.
Optimization algorithms for the cost function
An objective function measures how good a particular set of parameters is; optimization means minimizing it over those parameters. In machine learning the objective is usually a measure of how far the predictions are from the actual values, i.e. a cost. We want the set of parameters that gives the smallest possible cost, because that means the algorithm is doing its job well. The lowest value the cost can take is its global minimum, but a cost function can also have many local minima. Fortunately, in very high-dimensional parameter spaces, local minima bad enough to block optimization are rare: a point is only a local minimum if the cost curves upward along every single parameter direction at once, which becomes very unlikely as the number of parameters grows. What we are left with instead are mostly saddle points rather than true minima.
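To make "cost" concrete, here is a minimal sketch of a mean-squared-error cost for a one-parameter linear model (the data and the parameter values here are made up for illustration and are not used anywhere else in the post):

import numpy as np

def mseCost(theta, x, yTrue):
    # Mean squared error of the one-parameter model yPred = theta * x
    yPred = theta * x
    return np.mean((yPred - yTrue)**2)

x = np.array([1.0, 2.0, 3.0])
yTrue = np.array([2.1, 3.9, 6.2])
print(mseCost(2.0, x, yTrue))   # ~0.02  -> theta = 2.0 fits the data well
print(mseCost(0.0, x, yTrue))   # ~19.35 -> theta = 0.0 fits it badly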
Stochastic gradient descent (SGD)
In stochastic gradient descent you are very likely to come across an update equation of this form:

$$ \theta = \theta - \alpha\,\frac{dJ(\theta)}{d\theta} $$

Read it with the mountain analogy in mind:
θ: your position on the mountain
dJ(θ)/dθ: the steepness of the hill at that position
α: the learning rate, i.e. the size of the step you take
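As a toy numerical illustration of one such step, take J(θ) = θ², so that dJ(θ)/dθ = 2θ (a made-up objective, not the one used in the examples below):

theta = 3.0           # current position
alpha = 0.1           # learning rate
grad = 2*theta        # dJ/dtheta for J(theta) = theta**2
theta = theta - alpha*grad
print(theta)          # 2.4 -- one step closer to the minimum at theta = 0

The examples that follow use a less convenient objective: the next block plots J(θ) = cos(3πθ)/θ together with its derivative, and the block after it runs exactly this update rule on that function.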
import numpy as np
import matplotlib.pyplot as plt

def minimaFunction(theta):
    # Objective function J(theta) = cos(3*pi*theta)/theta
    return np.cos(3*np.pi*theta)/theta

def minimaFunctionDerivative(theta):
    # Analytic derivative dJ(theta)/dtheta
    const1 = 3*np.pi
    const2 = const1*theta
    return -(const1*np.sin(const2)/theta)-np.cos(const2)/theta**2

theta = np.arange(.1,2.1,.01)
Jtheta = minimaFunction(theta)
dJtheta = minimaFunctionDerivative(theta)

plt.plot(theta,Jtheta,label = r'$J(\theta)$')
plt.plot(theta,dJtheta/30,label = r'$dJ(\theta)/30$')
plt.legend()
axes = plt.gca()
#axes.set_ylim([-10,10])
plt.ylabel(r'$J(\theta),dJ(\theta)/30$')
plt.xlabel(r'$\theta$')
plt.title(r'$J(\theta),dJ(\theta)/30 $ vs $\theta$')
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

def optimize(iterations, oF, dOF, params, learningRate):
    """
    Computes the optimal value of params for a given objective function and its derivative
    Arguments:
        - iterations - the number of iterations required to optimize the objective function
        - oF - the objective function
        - dOF - the derivative function of the objective function
        - params - the parameters of the function to optimize
        - learningRate - the learning rate
    Return:
        - oParams - the list of optimized parameters at each step of iteration
    """
    oParams = [params]
    # The iteration loop
    for i in range(iterations):
        # Compute the derivative of the parameters
        dParams = dOF(params)
        # Compute the update
        params = params - learningRate*dParams
        # Append the new parameters
        oParams.append(params)
    return np.array(oParams)

def minimaFunction(theta):
    return np.cos(3*np.pi*theta)/theta

def minimaFunctionDerivative(theta):
    const1 = 3*np.pi
    const2 = const1*theta
    return -(const1*np.sin(const2)/theta)-np.cos(const2)/theta**2

theta = .6
iterations = 45
learningRate = .0007
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               theta,
                               learningRate)
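After the loop finishes, optimizedParameters holds θ at every iteration, so a quick way to inspect the descent is to overlay those points on the objective. A minimal sketch, reusing minimaFunction, optimizedParameters and the pyplot import from the block above:

thetaGrid = np.arange(.1, 2.1, .01)
plt.plot(thetaGrid, minimaFunction(thetaGrid), label=r'$J(\theta)$')
plt.plot(optimizedParameters, minimaFunction(optimizedParameters), 'o', label='SGD steps')
plt.xlabel(r'$\theta$')
plt.legend()
plt.show()

The next block repeats the same plain SGD loop on a two-dimensional objective, a scaled and negated bivariate normal surface, to show how the method behaves when there is more than one parameter.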
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    # Bivariate normal objective (mlab.bivariate_normal is available in older matplotlib releases)
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    Z1 = mlab.bivariate_normal(X, Y, sigma11,sigma12,mu11,mu12)
    Z = Z1
    return -40*Z

def minimaFunctionDerivative(params):
    # Derivative of the bivariate normal function
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11)*(mu11 - X)/sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12)*(mu12 - Y)/sigma12**2
    return (dZ1X,dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta):
    """
    Computes the optimal value of params for a given objective function and its derivative
    Arguments:
        - iterations - the number of iterations required to optimize the objective function
        - oF - the objective function
        - dOF - the derivative function of the objective function
        - params - the parameters of the function to optimize
        - learningRate - the learning rate
        - beta - The weighted moving average parameter (accepted but not used by plain SGD)
    Return:
        - oParams - the list of optimized parameters at each step of iteration
    """
    oParams = [params]
    vdw = (0.0,0.0)   # not used by plain SGD; kept to mirror the momentum version below
    # The iteration loop
    for i in range(iterations):
        # Compute the derivative of the parameters
        dParams = dOF(params)
        # SGD: each parameter is updated as parameter = parameter - learningRate*dParameter
        params = tuple([par-learningRate*dPar for dPar,par in zip(dParams,params)])
        # Append the new parameters
        oParams.append(params)
    return oParams

iterations = 100
learningRate = 1
beta = .9
x,y = 4.0,1.0
params = (x,y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)
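Gradient descent with momentum

Plain SGD tends to oscillate when the objective is much steeper along one parameter than along another. Momentum damps those oscillations by replacing the raw gradient in the update with an exponentially weighted moving average of past gradients. For a parameter W with gradient dW, decay rate β and learning rate α, the update implemented in the code below is:

$$ v_{dW} = \beta\, v_{dW} + (1-\beta)\, dW $$
$$ W = W - \alpha\, v_{dW} $$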
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    # Bivariate normal objective
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    Z1 = mlab.bivariate_normal(X, Y, sigma11,sigma12,mu11,mu12)
    Z = Z1
    return -40*Z

def minimaFunctionDerivative(params):
    # Derivative of the bivariate normal function
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11)*(mu11 - X)/sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12)*(mu12 - Y)/sigma12**2
    return (dZ1X,dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta):
    """
    Computes the optimal value of params for a given objective function and its derivative
    Arguments:
        - iterations - the number of iterations required to optimize the objective function
        - oF - the objective function
        - dOF - the derivative function of the objective function
        - params - the parameters of the function to optimize
        - learningRate - the learning rate
        - beta - The weighted moving average parameter for momentum
    Return:
        - oParams - the list of optimized parameters at each step of iteration
    """
    oParams = [params]
    vdw = (0.0,0.0)
    # The iteration loop
    for i in range(iterations):
        # Compute the derivative of the parameters
        dParams = dOF(params)
        # Moving average of each gradient: vdw = vdw*beta + (1.0-beta)*dPar
        vdw = tuple([vDW*beta+(1.0-beta)*dPar for dPar,vDW in zip(dParams,vdw)])
        # Momentum update: parameter = parameter - learningRate*vdw
        params = tuple([par-learningRate*dPar for dPar,par in zip(vdw,params)])
        # Append the new parameters
        oParams.append(params)
    return oParams

iterations = 100
learningRate = 5.3
beta = .9
x,y = 4.0,1.0
params = (x,y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)
RMSProp
RMSProp tries to improve on momentum by taking into account the relative size of each parameter's gradient. It keeps an exponentially weighted moving average of each squared gradient and uses it to normalize the gradient-descent step, so parameters with large gradients get their steps scaled down far more than parameters with small gradients, which allows a smooth descent towards the optimum. This can be seen in the equations below:
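Written out for a single parameter W with gradient dW, decay rate β, learning rate α and a small ε to avoid division by zero, the update implemented in the code below is:

$$ s_{dW} = \beta\, s_{dW} + (1-\beta)\, dW^2 $$
$$ W = W - \alpha\, \frac{dW}{\sqrt{s_{dW}} + \epsilon} $$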
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    # Bivariate normal objective
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    Z1 = mlab.bivariate_normal(X, Y, sigma11,sigma12,mu11,mu12)
    Z = Z1
    return -40*Z

def minimaFunctionDerivative(params):
    # Derivative of the bivariate normal function
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11)*(mu11 - X)/sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12)*(mu12 - Y)/sigma12**2
    return (dZ1X,dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta):
    """
    Computes the optimal value of params for a given objective function and its derivative
    Arguments:
        - iterations - the number of iterations required to optimize the objective function
        - oF - the objective function
        - dOF - the derivative function of the objective function
        - params - the parameters of the function to optimize
        - learningRate - the learning rate
        - beta - The weighted moving average parameter for RMSProp
    Return:
        - oParams - the list of optimized parameters at each step of iteration
    """
    oParams = [params]
    sdw = (0.0,0.0)
    eps = 10**(-7)
    # The iteration loop
    for i in range(iterations):
        # Compute the derivative of the parameters
        dParams = dOF(params)
        # Moving average of the squared gradients: sdw = sdw*beta + (1.0-beta)*dPar**2
        sdw = tuple([sDW*beta+(1.0-beta)*dPar**2 for dPar,sDW in zip(dParams,sdw)])
        # RMSProp update: parameter = parameter - learningRate*dParameter/(sqrt(sdw)+eps)
        params = tuple([par-learningRate*dPar/((sDW**.5)+eps) for sDW,par,dPar in zip(sdw,params,dParams)])
        # Append the new parameters
        oParams.append(params)
    return oParams

iterations = 10
learningRate = .3
beta = .9
x,y = 5.0,1.0
params = (x,y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)
Adam
Adam combines the ideas of momentum and RMSProp into a single algorithm, aiming to get the best features of both. Its update equations are as follows:
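In the same notation, with v_{dW} the momentum average, s_{dW} the squared-gradient average, β₁ and β₂ their decay rates and t the 1-based step count, the update implemented in the code below is:

$$ v_{dW} = \beta_1\, v_{dW} + (1-\beta_1)\, dW $$
$$ s_{dW} = \beta_2\, s_{dW} + (1-\beta_2)\, dW^2 $$
$$ v^{corr}_{dW} = \frac{v_{dW}}{1-\beta_1^{\,t}}, \qquad s^{corr}_{dW} = \frac{s_{dW}}{1-\beta_2^{\,t}} $$
$$ W = W - \alpha\, \frac{v^{corr}_{dW}}{\sqrt{s^{corr}_{dW}} + \epsilon} $$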
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    # Bivariate normal objective
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    Z1 = mlab.bivariate_normal(X, Y, sigma11,sigma12,mu11,mu12)
    Z = Z1
    return -40*Z

def minimaFunctionDerivative(params):
    # Derivative of the bivariate normal function
    X,Y = params
    sigma11,sigma12,mu11,mu12 = (3.0,.5,0.0,0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11)*(mu11 - X)/sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12)*(mu12 - Y)/sigma12**2
    return (dZ1X,dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta1, beta2):
    """
    Computes the optimal value of params for a given objective function and its derivative
    Arguments:
        - iterations - the number of iterations required to optimize the objective function
        - oF - the objective function
        - dOF - the derivative function of the objective function
        - params - the parameters of the function to optimize
        - learningRate - the learning rate
        - beta1 - The weighted moving average parameter for the momentum component of ADAM
        - beta2 - The weighted moving average parameter for the RMSProp component of ADAM
    Return:
        - oParams - the list of optimized parameters at each step of iteration
    """
    oParams = [params]
    vdw = (0.0,0.0)
    sdw = (0.0,0.0)
    eps = 10**(-7)
    # The iteration loop
    for i in range(iterations):
        # Compute the derivative of the parameters
        dParams = dOF(params)
        # Momentum term: vdw = vdw*beta1 + (1.0-beta1)*dPar
        vdw = tuple([vDW*beta1+(1.0-beta1)*dPar for dPar,vDW in zip(dParams,vdw)])
        # RMSProp term: sdw = sdw*beta2 + (1.0-beta2)*dPar**2
        sdw = tuple([sDW*beta2+(1.0-beta2)*dPar**2.0 for dPar,sDW in zip(dParams,sdw)])
        # Bias correction for vdw and sdw (i is zero-based, so the step count is i+1)
        vdwCorr = tuple([vDW/(1.0-beta1**(i+1.0)) for vDW in vdw])
        sdwCorr = tuple([sDW/(1.0-beta2**(i+1.0)) for sDW in sdw])
        # ADAM update: parameter = parameter - learningRate*vdwCorr/(sqrt(sdwCorr)+eps)
        params = tuple([par-learningRate*vdwCORR/((sdwCORR**.5)+eps)
                        for vdwCORR,sdwCORR,par in zip(vdwCorr,sdwCorr,params)])
        # Append the new parameters
        oParams.append(params)
    return oParams

iterations = 100
learningRate = .1
beta1 = .9
beta2 = .999
x,y = 5.0,1.0
params = (x,y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta1,
                               beta2)
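A quick way to visualize any of the two-dimensional runs above is to plot the path of the parameters over contours of the objective. A minimal sketch, reusing minimaFunction and whichever optimizedParameters you computed last (and, like the rest of the examples, an older matplotlib release that still ships mlab.bivariate_normal):

xs = np.array([p[0] for p in optimizedParameters])
ys = np.array([p[1] for p in optimizedParameters])
gridX, gridY = np.meshgrid(np.arange(-6, 6, .05), np.arange(-3, 3, .05))
gridZ = minimaFunction((gridX, gridY))
plt.contour(gridX, gridY, gridZ, 20)
plt.plot(xs, ys, 'o-', label='optimization path')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()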