diff --git a/LogisticRegression/data1.npy b/LogisticRegression/data1.npy
new file mode 100644
index 0000000..a95550d
Binary files /dev/null and b/LogisticRegression/data1.npy differ
diff --git a/LogisticRegression/data1.txt b/LogisticRegression/data1.txt
new file mode 100644
index 0000000..3a5f952
--- /dev/null
+++ b/LogisticRegression/data1.txt
@@ -0,0 +1,100 @@
+34.62365962451697,78.0246928153624,0
+30.28671076822607,43.89499752400101,0
+35.84740876993872,72.90219802708364,0
+60.18259938620976,86.30855209546826,1
+79.0327360507101,75.3443764369103,1
+45.08327747668339,56.3163717815305,0
+61.10666453684766,96.51142588489624,1
+75.02474556738889,46.55401354116538,1
+76.09878670226257,87.42056971926803,1
+84.43281996120035,43.53339331072109,1
+95.86155507093572,38.22527805795094,0
+75.01365838958247,30.60326323428011,0
+82.30705337399482,76.48196330235604,1
+69.36458875970939,97.71869196188608,1
+39.53833914367223,76.03681085115882,0
+53.9710521485623,89.20735013750205,1
+69.07014406283025,52.74046973016765,1
+67.94685547711617,46.67857410673128,0
+70.66150955499435,92.92713789364831,1
+76.97878372747498,47.57596364975532,1
+67.37202754570876,42.83843832029179,0
+89.67677575072079,65.79936592745237,1
+50.534788289883,48.85581152764205,0
+34.21206097786789,44.20952859866288,0
+77.9240914545704,68.9723599933059,1
+62.27101367004632,69.95445795447587,1
+80.1901807509566,44.82162893218353,1
+93.114388797442,38.80067033713209,0
+61.83020602312595,50.25610789244621,0
+38.78580379679423,64.99568095539578,0
+61.379289447425,72.80788731317097,1
+85.40451939411645,57.05198397627122,1
+52.10797973193984,63.12762376881715,0
+52.04540476831827,69.43286012045222,1
+40.23689373545111,71.16774802184875,0
+54.63510555424817,52.21388588061123,0
+33.91550010906887,98.86943574220611,0
+64.17698887494485,80.90806058670817,1
+74.78925295941542,41.57341522824434,0
+34.1836400264419,75.2377203360134,0
+83.90239366249155,56.30804621605327,1
+51.54772026906181,46.85629026349976,0
+94.44336776917852,65.56892160559052,1
+82.36875375713919,40.61825515970618,0
+51.04775177128865,45.82270145776001,0
+62.22267576120188,52.06099194836679,0
+77.19303492601364,70.45820000180959,1
+97.77159928000232,86.7278223300282,1
+62.07306379667647,96.76882412413983,1
+91.56497449807442,88.69629254546599,1
+79.94481794066932,74.16311935043758,1
+99.2725269292572,60.99903099844988,1
+90.54671411399852,43.39060180650027,1
+34.52451385320009,60.39634245837173,0
+50.2864961189907,49.80453881323059,0
+49.58667721632031,59.80895099453265,0
+97.64563396007767,68.86157272420604,1
+32.57720016809309,95.59854761387875,0
+74.24869136721598,69.82457122657193,1
+71.79646205863379,78.45356224515052,1
+75.3956114656803,85.75993667331619,1
+35.28611281526193,47.02051394723416,0
+56.25381749711624,39.26147251058019,0
+30.05882244669796,49.59297386723685,0
+44.66826172480893,66.45008614558913,0
+66.56089447242954,41.09209807936973,0
+40.45755098375164,97.53518548909936,1
+49.07256321908844,51.88321182073966,0
+80.27957401466998,92.11606081344084,1
+66.74671856944039,60.99139402740988,1
+32.72283304060323,43.30717306430063,0
+64.0393204150601,78.03168802018232,1
+72.34649422579923,96.22759296761404,1
+60.45788573918959,73.09499809758037,1
+58.84095621726802,75.85844831279042,1
+99.82785779692128,72.36925193383885,1
+47.26426910848174,88.47586499559782,1
+50.45815980285988,75.80985952982456,1
+60.45555629271532,42.50840943572217,0
+82.22666157785568,42.71987853716458,0
+88.9138964166533,69.80378889835472,1
+94.83450672430196,45.69430680250754,1
+67.31925746917527,66.58935317747915,1
+57.23870631569862,59.51428198012956,1
+80.36675600171273,90.96014789746954,1
+68.46852178591112,85.59430710452014,1
+42.0754545384731,78.84478600148043,0
+75.47770200533905,90.42453899753964,1
+78.63542434898018,96.64742716885644,1
+52.34800398794107,60.76950525602592,0
+94.09433112516793,77.15910509073893,1
+90.44855097096364,87.50879176484702,1
+55.48216114069585,35.57070347228866,0
+74.49269241843041,84.84513684930135,1
+89.84580670720979,45.35828361091658,1
+83.48916274498238,48.38028579728175,1
+42.2617008099817,87.10385094025457,1
+99.31500880510394,68.77540947206617,1
+55.34001756003703,64.9319380069486,1
+74.77589300092767,89.52981289513276,1
diff --git a/LogisticRegression/data2.txt b/LogisticRegression/data2.txt
new file mode 100644
index 0000000..a888992
--- /dev/null
+++ b/LogisticRegression/data2.txt
@@ -0,0 +1,118 @@
+0.051267,0.69956,1
+-0.092742,0.68494,1
+-0.21371,0.69225,1
+-0.375,0.50219,1
+-0.51325,0.46564,1
+-0.52477,0.2098,1
+-0.39804,0.034357,1
+-0.30588,-0.19225,1
+0.016705,-0.40424,1
+0.13191,-0.51389,1
+0.38537,-0.56506,1
+0.52938,-0.5212,1
+0.63882,-0.24342,1
+0.73675,-0.18494,1
+0.54666,0.48757,1
+0.322,0.5826,1
+0.16647,0.53874,1
+-0.046659,0.81652,1
+-0.17339,0.69956,1
+-0.47869,0.63377,1
+-0.60541,0.59722,1
+-0.62846,0.33406,1
+-0.59389,0.005117,1
+-0.42108,-0.27266,1
+-0.11578,-0.39693,1
+0.20104,-0.60161,1
+0.46601,-0.53582,1
+0.67339,-0.53582,1
+-0.13882,0.54605,1
+-0.29435,0.77997,1
+-0.26555,0.96272,1
+-0.16187,0.8019,1
+-0.17339,0.64839,1
+-0.28283,0.47295,1
+-0.36348,0.31213,1
+-0.30012,0.027047,1
+-0.23675,-0.21418,1
+-0.06394,-0.18494,1
+0.062788,-0.16301,1
+0.22984,-0.41155,1
+0.2932,-0.2288,1
+0.48329,-0.18494,1
+0.64459,-0.14108,1
+0.46025,0.012427,1
+0.6273,0.15863,1
+0.57546,0.26827,1
+0.72523,0.44371,1
+0.22408,0.52412,1
+0.44297,0.67032,1
+0.322,0.69225,1
+0.13767,0.57529,1
+-0.0063364,0.39985,1
+-0.092742,0.55336,1
+-0.20795,0.35599,1
+-0.20795,0.17325,1
+-0.43836,0.21711,1
+-0.21947,-0.016813,1
+-0.13882,-0.27266,1
+0.18376,0.93348,0
+0.22408,0.77997,0
+0.29896,0.61915,0
+0.50634,0.75804,0
+0.61578,0.7288,0
+0.60426,0.59722,0
+0.76555,0.50219,0
+0.92684,0.3633,0
+0.82316,0.27558,0
+0.96141,0.085526,0
+0.93836,0.012427,0
+0.86348,-0.082602,0
+0.89804,-0.20687,0
+0.85196,-0.36769,0
+0.82892,-0.5212,0
+0.79435,-0.55775,0
+0.59274,-0.7405,0
+0.51786,-0.5943,0
+0.46601,-0.41886,0
+0.35081,-0.57968,0
+0.28744,-0.76974,0
+0.085829,-0.75512,0
+0.14919,-0.57968,0
+-0.13306,-0.4481,0
+-0.40956,-0.41155,0
+-0.39228,-0.25804,0
+-0.74366,-0.25804,0
+-0.69758,0.041667,0
+-0.75518,0.2902,0
+-0.69758,0.68494,0
+-0.4038,0.70687,0
+-0.38076,0.91886,0
+-0.50749,0.90424,0
+-0.54781,0.70687,0
+0.10311,0.77997,0
+0.057028,0.91886,0
+-0.10426,0.99196,0
+-0.081221,1.1089,0
+0.28744,1.087,0
+0.39689,0.82383,0
+0.63882,0.88962,0
+0.82316,0.66301,0
+0.67339,0.64108,0
+1.0709,0.10015,0
+-0.046659,-0.57968,0
+-0.23675,-0.63816,0
+-0.15035,-0.36769,0
+-0.49021,-0.3019,0
+-0.46717,-0.13377,0
+-0.28859,-0.060673,0
+-0.61118,-0.067982,0
+-0.66302,-0.21418,0
+-0.59965,-0.41886,0
+-0.72638,-0.082602,0
+-0.83007,0.31213,0
+-0.72062,0.53874,0
+-0.59389,0.49488,0
+-0.48445,0.99927,0
+-0.0063364,0.99927,0
+0.63265,-0.030612,0
diff --git a/LogisticRegression/logisticRegression.py b/LogisticRegression/logisticRegression.py
new file mode 100644
index 0000000..a117580
--- /dev/null
+++ b/LogisticRegression/logisticRegression.py
@@ -0,0 +1,153 @@
+#-*- coding: utf-8 -*-
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy import optimize
+from matplotlib.font_manager import FontProperties
+font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)  # works around garbled CJK text in matplotlib plots on Windows
+
+
+def LogisticRegression():
+    data = loadtxtAndcsv_data("data2.txt", ",", np.float64)
+    X = data[:, 0:-1]
+    y = data[:, -1]
+
+    plot_data(X, y)  # plot the raw data
+
+    X = mapFeature(X[:, 0], X[:, 1])      # map the two features to polynomial terms
+    initial_theta = np.zeros(X.shape[1])  # initialize theta (1-D, as scipy's optimizers expect)
+    initial_lambda = 0.1                  # regularization strength; typical values are 0.01, 0.1, 1, ...
+
+    J = costFunction(initial_theta, X, y, initial_lambda)  # cost at the initial theta and lambda
+    print(J)  # should be about 0.693147 (= ln 2)
+
+    # result = optimize.fmin(costFunction, initial_theta, args=(X, y, initial_lambda))  # plain simplex minimization works poorly here
+    '''Minimize with scipy's fmin_bfgs (the Broyden-Fletcher-Goldfarb-Shanno quasi-Newton method):
+    - costFunction computes the regularized cost,
+    - initial_theta is the starting point,
+    - fprime supplies the gradient of costFunction,
+    - args passes the remaining parameters as a tuple; the theta minimizing costFunction is returned.
+    '''
+    result = optimize.fmin_bfgs(costFunction, initial_theta, fprime=gradient, args=(X, y, initial_lambda))
+    p = predict(X, result)  # predict on the training set
+    print('Training-set accuracy: %f%%' % np.mean(np.float64(p == y) * 100))  # p == y gives booleans; cast to float
+
+    X = data[:, 0:-1]
+    y = data[:, -1]
+    plotDecisionBoundary(result, X, y)  # plot the decision boundary
+
+
+# load txt and csv files
+def loadtxtAndcsv_data(fileName, split, dataType):
+    return np.loadtxt(fileName, delimiter=split, dtype=dataType)
+
+# load npy files
+def loadnpy_data(fileName):
+    return np.load(fileName)
+
+# scatter plot of the two classes
+def plot_data(X, y):
+    pos = np.where(y == 1)  # indices of the positive examples
+    neg = np.where(y == 0)  # indices of the negative examples
+    plt.figure(figsize=(15, 12))
+    plt.plot(X[pos, 0], X[pos, 1], 'ro')  # red o
+    plt.plot(X[neg, 0], X[neg, 1], 'bo')  # blue o
+    plt.title("Scatter plot of the two classes", fontproperties=font)
+    plt.show()
+
+# map the two features to polynomial terms
+def mapFeature(X1, X2):
+    degree = 3  # highest degree of the mapping
+    out = np.ones((X1.shape[0], 1))  # the mapped feature matrix (replaces X)
+    '''
+    With degree=2, for example, the mapping is 1, x1, x2, x1^2, x1*x2, x2^2
+    '''
+    for i in np.arange(1, degree + 1):
+        for j in range(i + 1):
+            temp = X1**(i - j) * (X2**j)  # element-wise product, the equivalent of MATLAB's .*
+            out = np.hstack((out, temp.reshape(-1, 1)))
+    return out
+
+# regularized cost function
+def costFunction(initial_theta, X, y, initial_lambda):
+    m = len(y)
+
+    h = sigmoid(np.dot(X, initial_theta))  # h(z)
+    theta1 = initial_theta.copy()  # the regularization sum runs over j >= 1, so copy theta and zero out theta[0]
+    theta1[0] = 0
+
+    temp = np.dot(np.transpose(theta1), theta1)
+    J = (-np.dot(np.transpose(y), np.log(h)) - np.dot(np.transpose(1 - y), np.log(1 - h)) + temp * initial_lambda / 2) / m  # regularized cost
+    return J
+
+# gradient of the regularized cost
+def gradient(initial_theta, X, y, initial_lambda):
+    m = len(y)
+
+    h = sigmoid(np.dot(X, initial_theta))  # h(z)
+    theta1 = initial_theta.copy()
+    theta1[0] = 0  # theta[0] is not regularized
+
+    grad = np.dot(np.transpose(X), h - y) / m + initial_lambda / m * theta1  # regularized gradient
+    return grad
+
+# sigmoid function
+def sigmoid(z):
+    return 1.0 / (1 + np.exp(-z))
+
+
+# plot the decision boundary
+def plotDecisionBoundary(theta, X, y):
+    pos = np.where(y == 1)  # indices of the positive examples
+    neg = np.where(y == 0)  # indices of the negative examples
+    plt.figure(figsize=(15, 12))
+    plt.plot(X[pos, 0], X[pos, 1], 'ro')  # red o
+    plt.plot(X[neg, 0], X[neg, 1], 'bo')  # blue o
+    plt.title("Decision boundary", fontproperties=font)
+
+    u = np.linspace(-1, 1.5, 50)  # adjust these ranges to the data (for data1.txt use np.linspace(30, 100, 100))
+    v = np.linspace(-1, 1.5, 50)
+
+    z = np.zeros((len(u), len(v)))
+    for i in range(len(u)):
+        for j in range(len(v)):
+            z[i, j] = np.dot(mapFeature(u[i].reshape(1, -1), v[j].reshape(1, -1)), theta)  # theta'x at the mapped grid point
+
+    z = np.transpose(z)
+    plt.contour(u, v, z, [0, 0.01], linewidths=2.0)  # levels 0 and 0.01: a narrow band approximating the boundary
+    plt.show()
+
+# predict
+def predict(X, theta):
+    m = X.shape[0]
+    p = sigmoid(np.dot(X, theta))  # predicted probabilities
+
+    for i in range(m):
+        if p[i] > 0.5:  # probabilities above 0.5 are predicted as 1, the rest as 0
+            p[i] = 1
+        else:
+            p[i] = 0
+    return p
+
+
+# exercise the logistic regression routine
+def testLogisticRegression():
+    LogisticRegression()
+
+
+if __name__ == "__main__":
+    testLogisticRegression()
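
Since fmin_bfgs depends entirely on gradient being the true derivative of costFunction, it is worth sanity-checking the analytic gradient against a central-difference approximation. Below is a minimal sketch of such a check, not part of the diff itself; it assumes logisticRegression.py is importable from the working directory with data2.txt alongside it, and the epsilon, random seed, and variable names are arbitrary choices.

import numpy as np
from logisticRegression import costFunction, gradient, mapFeature, loadtxtAndcsv_data

data = loadtxtAndcsv_data("data2.txt", ",", np.float64)
X = mapFeature(data[:, 0], data[:, 1])
y = data[:, -1]

theta = np.random.default_rng(0).normal(size=X.shape[1])  # arbitrary point at which to compare the gradients
lam = 0.1
eps = 1e-5

numeric = np.zeros_like(theta)
for k in range(theta.size):
    step = np.zeros_like(theta)
    step[k] = eps
    # central difference approximates dJ/dtheta_k
    numeric[k] = (costFunction(theta + step, X, y, lam)
                  - costFunction(theta - step, X, y, lam)) / (2 * eps)

diff = np.max(np.abs(numeric - gradient(theta, X, y, lam)))
print(diff)  # should be on the order of 1e-9 or smaller if the gradient is correct

Running the same check with lam = 0 isolates the unregularized term, which helps localize a mismatch to either the data term or the regularization term.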