2020吴恩达 machine learning 编程作业 python实现 ex8

    技术2025-10-10  4

    # -*- coding: utf-8 -*- """ Created on Sat Jul 4 01:40:18 2020 @author: cheetah023 """ import numpy as np import scipy.io as sci import matplotlib.pyplot as plt #函数定义 def estimateGaussian(X): m,n = X.shape mu = np.zeros([n,1]) sigma2 = np.zeros([n]) mu = np.sum(X,axis=0).T / m for i in range(0,n): sigma2[i] = np.sum((X[:,i] - mu[i]) ** 2) / m return mu, sigma2 def multivariateGaussian(X, mu, sigma2): k = len(mu) m = X.shape[0] if np.ndim(sigma2) == 1: sigma2 = np.diag(sigma2) right = np.zeros([m,1]) left = np.power(2*np.pi,-(k/2)) left = left * np.power(np.linalg.det(sigma2),-(1/2)) for i in range(0,m): right[i] = np.exp(-(1/2) * np.dot(np.dot((X[i] - mu).T,np.linalg.inv(sigma2)),X[i]-mu)) p = left * right return p def visualizeFit(X, mu, sigma2): x1 = np.linspace(0,30,50) x2 = np.linspace(0,30,50) xx,yy = np.meshgrid(x1,x2) X_temp = np.column_stack([xx.flatten(),yy.flatten()]) print('X_temp:',X_temp.shape) p = multivariateGaussian(X_temp, mu, sigma2) p = np.reshape(p,xx.shape) levels = [10**h for h in range(-20,0,3)] plt.contour(xx,yy,p,levels) def selectThreshold(yval, pval): epsilons = np.linspace(min(pval),max(pval),1000) bestEpsilon = 0 bestF1 = 0 for epsilon in epsilons: tp=fp=fn=0 for i in range(0,len(pval)): if pval[i] < epsilon and yval[i] == 1: tp += 1 elif pval[i] < epsilon and yval[i] == 0: fp += 1 elif pval[i] >= epsilon and yval[i] == 1: fn += 1 if tp + fp: prec = tp / (tp + fp) rec = tp / (tp + fn) F1 = 2 * (prec * rec) / (prec + rec) else: F1 = 0 if F1 > bestF1: bestEpsilon = epsilon bestF1 = F1 return bestEpsilon,bestF1 #Part 1: Load Example Dataset data = sci.loadmat('ex8data1.mat') #print('data.keys',data.keys()) X = data['X'] Xval = data['Xval'] yval = data['yval'] #print('X:',X.shape) #print('Xval:',Xval.shape) #print('yval:',yval.shape) plt.figure(0) plt.scatter(X[:,0],X[:,1],marker='x',c='b') plt.xlabel('Latency (ms)') plt.ylabel('Throughput (mb/s)') #Part 2: Estimate the dataset statistics [mu,sigma2] = estimateGaussian(X) p = multivariateGaussian(X, mu, sigma2) visualizeFit(X, mu, sigma2) #Part 3: Find Outliers pval = multivariateGaussian(Xval, mu, sigma2) [epsilon, F1] = selectThreshold(yval, pval) print('Best epsilon found using cross-validation:', epsilon) print('Best F1 on Cross Validation Set:', F1) print('(you should see a value epsilon of about 8.99e-05)') print('(you should see a Best F1 value of 0.875000)\n') outliers = np.where(p < epsilon) plt.scatter(X[outliers[0],0],X[outliers[0],1],marker='o',edgecolor='r') #Part 4: Multidimensional Outliers data = sci.loadmat('ex8data2.mat') #print('data.keys',data.keys()) X = data['X'] Xval = data['Xval'] yval = data['yval'] #print('X:',X.shape) #print('Xval:',Xval.shape) #print('yval:',yval.shape) [mu, sigma2] = estimateGaussian(X) p = multivariateGaussian(X, mu, sigma2) pval = multivariateGaussian(Xval, mu, sigma2) [epsilon, F1] = selectThreshold(yval, pval) print('for ex8data2.mat:') print('Best epsilon found using cross-validation:', epsilon) print('Best F1 on Cross Validation Set:', F1) print('(you should see a value epsilon of about 1.38e-18)') print('(you should see a Best F1 value of 0.615385)') print('# Outliers found:', sum(p < epsilon))

    运行结果:

    X_temp: (2500, 2) Best epsilon found using cross-validation: [8.99985263e-05] Best F1 on Cross Validation Set: 0.8750000000000001 (you should see a value epsilon of about 8.99e-05) (you should see a Best F1 value of  0.875000)

    for ex8data2.mat: Best epsilon found using cross-validation: [1.3786075e-18] Best F1 on Cross Validation Set: 0.6153846153846154 (you should see a value epsilon of about 1.38e-18) (you should see a Best F1 value of 0.615385) # Outliers found: [117]

    总结:

    1、在实现selectThreshold的时候本来想用矩阵的逻辑运算来实现,没成功。还是使用的两个for循环,不过看着还行逻辑比较清楚

    2、在计算多变量高斯分布multivariateGaussian()时,使用的是这个公式

     

     

     

    Processed: 0.011, SQL: 9