import numpy as np
import matplotlib.pyplot as plt
import math
# Seed NumPy's global RNG so the random permutation drawn in split_samples
# (and therefore the train/test split) is the same on every run.
np.random.seed(1)
def sigmoid(x):
    """Evaluate the logistic function ``1 / (1 + exp(-x))``.

    ``x`` is handed to ``np.exp``, so both scalars and NumPy arrays are
    accepted; arrays are processed element-wise.
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)
def train(X, Y):
    """Fit one threshold stump per feature with boosting-style re-weighting.

    Features are processed in column order.  The stump for feature ``i``
    predicts the positive class when ``X[:, i] > th[i]``, where ``th[i]`` is
    the mean of column ``i``.  Its weighted error ``e`` under the current
    sample weights yields the vote ``beta[i] = 0.5 * log((1 - e) / e)``.
    The sample weights are then multiplied by ``sigmoid(beta[i])`` for
    misclassified samples and ``sigmoid(-beta[i])`` for correctly classified
    ones, and renormalized to sum to one.

    Args:
        X: NumPy array of shape (n, p): n samples with p features.
        Y: Labels with n entries, shaped (n,) or (n, 1).  Any non-zero entry
            counts as the positive class.

    Returns:
        A tuple ``(beta, th)`` of two length-p arrays: the per-feature votes
        and the per-feature thresholds.

    Raises:
        ValueError: If ``X`` has no rows or ``Y`` does not hold exactly one
            label per row of ``X``.
    """
    n, p = X.shape
    if n == 0:
        raise ValueError("train() needs at least one sample")

    # Flatten so that (n,) and (n, 1) label arrays behave the same.
    positive = np.asarray(Y).reshape(-1) != 0
    if positive.shape[0] != n:
        raise ValueError("Y must contain exactly one label per row of X")

    # Smallest/largest weighted error we allow.  A stump that is perfectly
    # right (e == 0) or perfectly wrong (e == 1) would otherwise produce an
    # infinite vote or a math domain error, and then NaN sample weights that
    # poison every later feature.
    eps = 1e-10

    weight = np.ones(n) / n
    th = X.sum(axis=0) / n
    beta = np.ones(p)

    for i in range(p):
        miss = (X[:, i] > th[i]) != positive
        e = (miss * weight).sum()
        e = min(max(e, eps), 1.0 - eps)
        beta[i] = 0.5 * math.log((1 - e) / e)

        # Logistic factors sigmoid(beta) and sigmoid(-beta).
        # NOTE(review): classic AdaBoost scales by exp(+/-beta) instead;
        # confirm the milder logistic re-weighting is intended.
        grow = 1.0 / (1.0 + np.exp(-beta[i]))
        shrink = 1.0 / (1.0 + np.exp(beta[i]))
        w = weight * np.where(miss, grow, shrink)
        weight = w / w.sum()

    return (beta, th)
def getAccuracy(beta, X, Y, th):
    """Return the fraction of samples the weighted stump vote gets right.

    Each feature ``i`` casts a vote of ``+beta[i]`` when ``X[j, i] > th[i]``
    and ``-beta[i]`` otherwise.  The votes of a sample are summed, and the
    sample counts as correct when that sum has the same sign as its label
    mapped from {0, 1} to {-1, +1}.  A sum of exactly zero counts as wrong.

    Args:
        beta: Length-p sequence of per-feature votes.
        X: NumPy array of shape (n, p).
        Y: 0/1 labels with n entries, shaped (n,) or (n, 1).
        th: Length-p sequence of per-feature thresholds.

    Returns:
        The accuracy as a Python float in [0, 1].

    Raises:
        ZeroDivisionError: If ``X`` has no rows.
    """
    n, _ = X.shape

    # +1 where the stump fires, -1 where it does not.
    votes = np.where(X > th, 1.0, -1.0)
    scores = (votes * beta).sum(axis=1)

    # Flatten the labels so an (n, 1) column does not broadcast to (n, n).
    signed_labels = 2 * np.asarray(Y).reshape(-1) - 1
    hits = int(np.count_nonzero(scores * signed_labels > 0))
    return hits / n
def load_digits(subset=None, normalize=True, path='digits.csv'):
    """
    Load digits and labels from a CSV file (digits.csv by default).

    The last CSV column is read as the label; all other columns are the
    features.

    Args:
        subset: A subset of digit from 0 to 9 to return.
                If not specified, all digits will be returned and the
                labels are left unchanged.
        normalize: Whether to normalize data values to between 0 and 1.
        path: Location of the CSV file to read.
    Returns:
        digits: Digits data matrix of the subset specified.
                The shape is (n, p), where
                    n is the number of examples,
                    p is the dimension of features.
        labels: Labels of the digits in an (n, 1) array.
                Each of label[i] is the label for data[i, :].
                When subset is given, a label is replaced by the position
                of its digit within subset.
    """
    import pandas as pd

    df = pd.read_csv(path)
    if subset is not None:
        subset = list(subset)
        df = df[df.iloc[:, -1].isin(subset)]

    digits = df.iloc[:, :-1].values.astype('float')
    labels = df.iloc[:, -1].values.astype('int')

    # Skip normalization for empty data (min/max would raise) and avoid
    # dividing by zero when every value is identical.
    if normalize and digits.size:
        digits -= digits.min()
        peak = digits.max()
        if peak > 0:
            digits /= peak

    if subset is not None:
        # Compare against an untouched copy: relabeling in place would
        # re-map values that were already converted (e.g. subset=[5, 0]
        # would turn every label into 1).
        raw = labels.copy()
        # Walk backwards so that a digit listed twice keeps its first
        # position.
        for position in range(len(subset) - 1, -1, -1):
            labels[raw == subset[position]] = position

    labels = labels.reshape((labels.shape[0], 1))
    return digits, labels
def split_samples(digits, labels):
    """Randomly partition the samples: 70% for training, 30% for testing.

    The rows are shuffled with ``np.random.permutation``; the same row order
    is applied to ``digits`` and ``labels`` so each sample keeps its label.

    Returns:
        ``(training_digits, training_labels, testing_digits, testing_labels)``
    """
    total = digits.shape[0]
    cut = round(total * 0.7)
    shuffled = np.random.permutation(total)

    train_rows = shuffled[:cut]
    test_rows = shuffled[cut:]

    train_part = (digits[train_rows], labels[train_rows])
    test_part = (digits[test_rows], labels[test_rows])
    return train_part + test_part
# Driver: load the digits 3 and 5, split them into training and testing sets,
# fit the per-feature stumps on the training part and report both accuracies.
digits, labels = load_digits(normalize=True, subset=[3, 5])
(training_digits, training_labels,
 testing_digits, testing_labels) = split_samples(digits, labels)

print('# training', len(training_digits))
print('# testing', len(testing_digits))

beta, th = train(training_digits, training_labels)

# The same votes and thresholds are scored on both partitions.
training_accuracy = getAccuracy(beta, training_digits, training_labels, th)
print('Accuracy on training data: %f' % training_accuracy)
testing_accuracy = getAccuracy(beta, testing_digits, testing_labels, th)
print('Accuracy on testing data: %f' % testing_accuracy)