
Implementing AdaBoost in Python and R

### 1. AdaBoost

The basic idea of AdaBoost is to combine a number of weak classifiers into one strong classifier, the proverbial "three cobblers together outwit Zhuge Liang" (many heads are better than one). How weak can these weak classifiers be? This post uses decision stumps (single-level decision trees): for each feature, the threshold is that feature's mean, and each feature then acts as one weak classifier.
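As a quick illustration (a minimal sketch of the idea only; the function name `stump_predict` is mine, not from the code below), a mean-threshold stump on a single feature looks like this:

```python
import numpy as np

def stump_predict(X, feature):
    """Hypothetical mean-threshold decision stump: predict 1 when the
    chosen feature exceeds its column mean, else 0."""
    th = X[:, feature].mean()                # threshold = mean of that column
    return (X[:, feature] > th).astype(int)
```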

Two kinds of parameters matter in AdaBoost. The first is the weak-classifier weight (`beta` in the code), which expresses how important a given weak classifier is. The second is the sample weight (the normalized `weight` in the code, initialized to 1/n so that every sample takes part in the decision equally). Li Hang's book calls this the probability distribution over the samples.
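In symbols, following the standard notation (as in Li Hang's book): if $G_m(x) \in \{-1, +1\}$ is the $m$-th weak classifier and $\beta_m$ its weight, the strong classifier is the sign of the weighted vote:

$$F(x) = \operatorname{sign}\left(\sum_{m=1}^{M} \beta_m G_m(x)\right)$$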

The "Ada" (adaptive) in AdaBoost shows in how each round trains one weak classifier and adds it to the existing set of weak classifiers. How do the parameters change during this process?

1. First, the sample weights `weight` produced by the previous round of training are used as the sample weights for the current round.

2. Then, using the loss function (as noted in the previous post, the exponential loss), we minimize the loss on the training set: differentiate with respect to `beta` and set the derivative to zero, which yields the weight `beta` of this round's weak classifier (see the closed form after this list).

3. Finally, update the sample weights `weight` for the next round; the update again depends on the `beta` just computed (see the note after this list). The method can be read off the Python or R code below: `w[j] = weight[j]*sigmoid(beta[i])` for misclassified samples, `w[j] = weight[j]*sigmoid(-beta[i])` for correctly classified ones, followed by `weight = w/w.sum()`.
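To spell out steps 2 and 3: with weighted error $e_m = \sum_{j=1}^{n} w_j \,\mathbb{1}\{G_m(x_j) \neq y_j\}$, setting the derivative of the exponential loss to zero gives the closed form used in the code (`beta[i] = 0.5*log((1-e)/e)`):

$$\beta_m = \frac{1}{2}\ln\frac{1-e_m}{e_m}$$

so a stump that beats random guessing ($e_m < 0.5$) receives a positive weight. One remark on the update in step 3: the code multiplies by a sigmoid rather than the exponential of textbook AdaBoost. Since $\sigma(\beta) = e^{\beta/2}/(e^{\beta/2} + e^{-\beta/2})$ and that common denominator cancels during renormalization, the two are equivalent up to halving the exponent: with $s_j = +1$ if sample $j$ is misclassified and $s_j = -1$ otherwise,

$$w_j \leftarrow \frac{w_j\,\sigma(s_j\beta_m)}{\sum_k w_k\,\sigma(s_k\beta_m)} = \frac{w_j\,e^{s_j\beta_m/2}}{\sum_k w_k\,e^{s_k\beta_m/2}}.$$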

Based on the method above, we can write the following Python and R code:


## Code

Data download: digits.csv

### Python

```python
import numpy as np
import math

np.random.seed(1)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def train(X, Y):
    (n, p) = X.shape
    weight = np.ones(n) / n        # sample weights, initially uniform
    w = np.ones(n)
    th = X.sum(axis=0) / n         # per-feature threshold: the column mean
    X1 = np.ones(n)
    error = np.ones(n)
    beta = np.ones(p)
    for i in range(p):             # each feature gives one decision stump
        for j in range(n):
            X1[j] = X[j, i] > th[i]
            error[j] = bool(X1[j]) != bool(Y[j])
        e = (error * weight).sum()               # weighted error of this stump
        beta[i] = 0.5 * math.log((1 - e) / e)    # weak-classifier weight
        for j in range(n):                       # update the sample weights
            if error[j]:
                w[j] = weight[j] * sigmoid(beta[i])
            else:
                w[j] = weight[j] * sigmoid(-beta[i])
        weight = w / w.sum()                     # renormalize
    return (beta, th)

def getAccuracy(beta, X, Y, th):
    (n, p) = X.shape
    X1 = np.ones((n, p))
    y_pre = np.ones((n, p))
    Y = 2 * Y - 1                  # map labels {0, 1} to {-1, +1}
    num = 0
    for i in range(p):
        for j in range(n):
            X1[j, i] = X[j, i] > th[i]
            X1[j, i] = 2 * X1[j, i] - 1
            y_pre[j, i] = beta[i] * X1[j, i]
    Y_pre = y_pre.sum(axis=1)      # weighted vote of all stumps
    for i in range(n):
        if Y_pre[i] * Y[i] > 0:    # prediction and label agree in sign
            num = num + 1
    accuracy = num / n
    return accuracy

# load data
def load_digits(subset=None, normalize=True):
    """
    Load digits and labels from digits.csv.

    Args:
        subset: A subset of digits from 0 to 9 to return.
            If not specified, all digits will be returned.
        normalize: Whether to normalize data values to between 0 and 1.

    Returns:
        digits: Digits data matrix of the subset specified.
            The shape is (n, p), where
                n is the number of examples,
                p is the dimension of features.
        labels: Labels of the digits in an (n, ) array.
            Each of label[i] is the label for data[i, :].
    """
    # load digits.csv, adopted from sklearn.
    import pandas as pd
    df = pd.read_csv('digits.csv')
    # only keep the numbers we want.
    if subset is not None:
        df = df[df.iloc[:, -1].isin(subset)]
    # convert to numpy arrays.
    digits = df.iloc[:, :-1].values.astype('float')
    labels = df.iloc[:, -1].values.astype('int')
    # Normalize digit values to between 0 and 1.
    if normalize:
        digits -= digits.min()
        digits /= digits.max()
    # Change the labels to 0 and 1.
    if subset is not None:
        for i in range(len(subset)):
            labels[labels == subset[i]] = i
    labels = labels.reshape((labels.shape[0], 1))
    return digits, labels

def split_samples(digits, labels):
    """Split the data into a training set (70%) and a testing set (30%)."""
    num_samples = digits.shape[0]
    num_training = round(num_samples * 0.7)
    indices = np.random.permutation(num_samples)
    training_idx, testing_idx = indices[:num_training], indices[num_training:]
    return (digits[training_idx], labels[training_idx],
            digits[testing_idx], labels[testing_idx])

# ====================================
# Load digits and labels.
digits, labels = load_digits(subset=[3, 5], normalize=True)
training_digits, training_labels, testing_digits, testing_labels = split_samples(digits, labels)
print('# training', training_digits.shape[0])
print('# testing', testing_digits.shape[0])

# Train the boosted stumps and display accuracy.
beta, th = train(training_digits, training_labels)
training_accuracy = getAccuracy(beta, training_digits, training_labels, th)
testing_accuracy = getAccuracy(beta, testing_digits, testing_labels, th)
print('Accuracy on training data: %f' % training_accuracy)
print('Accuracy on testing data: %f' % testing_accuracy)
```
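As an optional sanity check (not part of the original post; it assumes scikit-learn is installed), the same split can be fed to scikit-learn's `AdaBoostClassifier`, whose default base estimator is likewise a depth-1 decision tree:

```python
# Hypothetical cross-check with scikit-learn (assumption: sklearn installed;
# this snippet is not from the original post).
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=50)  # default base estimator is a stump
clf.fit(training_digits, training_labels.ravel())
print('sklearn AdaBoost test accuracy: %f'
      % clf.score(testing_digits, testing_labels.ravel()))
```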

### R

```r
library(data.table)  # provides fread, which quickly reads data from csv files

# calculate the sigmoid function
expit <- function(x) {
  y <- 1 / (1 + exp(-x))
  return(y)
}

train <- function(X, Y) {
  n <- dim(X)[1]
  p <- dim(X)[2]
  weight <- rep(1, n) / n      # sample weights, initially uniform
  w <- rep(1, n)
  th <- rep(1, p)
  beta <- rep(1, p)
  for (i in 1:p) {             # each feature gives one decision stump
    th[i] <- sum(X[, i]) / n   # threshold: the column mean
    X1 <- X[, i] > th[i]
    error <- xor(X1, Y)
    e <- sum(error * weight)              # weighted error of this stump
    beta[i] <- 0.5 * log((1 - e) / e)     # weak-classifier weight
    for (j in 1:n) {                      # update the sample weights
      if (error[j])
        w[j] <- weight[j] * expit(beta[i])
      else
        w[j] <- weight[j] * expit(-beta[i])
    }
    weight <- w / sum(w)                  # renormalize
  }
  return(list(beta, th))
}

getAccuracy <- function(beta, X, Y, th) {
  n <- dim(X)[1]
  p <- dim(X)[2]
  X1 <- matrix(rep(1, n * p), nrow = n)
  y_pre <- matrix(rep(1, n * p), nrow = n)
  Y <- 2 * Y - 1               # map labels {0, 1} to {-1, +1}
  num <- 0
  for (i in 1:p) {
    X1[, i] <- X[, i] > th[i]
    X1[, i] <- 2 * X1[, i] - 1
    y_pre[, i] <- beta[i] * X1[, i]
  }
  Y_pre <- rowSums(y_pre)      # weighted vote of all stumps
  for (i in 1:n) {
    if (sign(Y_pre[i]) == Y[i]) {
      num <- num + 1
    }
  }
  accuracy <- num / n
  return(accuracy)
}

# load data
load_digits <- function(subset = NULL, normalize = TRUE) {
  # Load digits and labels from digits.csv.
  # Args:
  #   subset: A subset of digits from 0 to 9 to return.
  #     If not specified, all digits will be returned.
  #   normalize: Whether to normalize data values to between 0 and 1.
  # Returns:
  #   digits: Digits data matrix of the subset specified.
  #     The shape is (n, p), where n is the number of examples
  #     and p is the dimension of features.
  #   labels: Labels of the digits in an (n, ) array.
  #     Each of label[i] is the label for data[i, :].
  # load digits.csv, adopted from sklearn.
  df <- fread("digits.csv")
  df <- as.matrix(df)
  ncols <- dim(df)[2]          # the last column holds the labels
  # only keep the numbers we want.
  if (length(subset) > 0) {
    l_col <- df[, ncols]
    index <- NULL
    for (i in 1:length(subset)) {
      number <- subset[i]
      index <- c(index, which(l_col == number))
    }
    index <- sort(index)
    df <- df[index, ]
  }
  # convert to arrays.
  digits <- df[, -ncols]
  labels <- df[, ncols]
  # Normalize digit values to between 0 and 1.
  if (normalize == TRUE) {
    digits <- digits - min(digits)
    digits <- digits / max(digits)
  }
  # Change the labels to 0 and 1.
  if (length(subset) > 0) {
    for (i in 1:length(subset)) {
      labels[labels == subset[i]] <- i - 1
    }
  }
  return(list(digits, labels))
}

split_samples <- function(digits, labels) {
  # Split the data into a training set (70%) and a testing set (30%).
  num_samples <- dim(digits)[1]
  num_training <- round(num_samples * 0.7)
  indices <- sample(1:num_samples, size = num_samples)
  training_idx <- indices[1:num_training]
  testing_idx <- indices[-(1:num_training)]
  return(list(digits[training_idx, ], labels[training_idx],
              digits[testing_idx, ], labels[testing_idx]))
}

# ====================================
# Load digits and labels.
result <- load_digits(subset = c(1, 7), normalize = TRUE)
digits <- result[[1]]
labels <- result[[2]]
result <- split_samples(digits, labels)
training_digits <- result[[1]]
training_labels <- result[[2]]
testing_digits <- result[[3]]
testing_labels <- result[[4]]

# print dimensions
dim(training_digits)
dim(testing_digits)

# Train the boosted stumps and display accuracy.
results <- train(training_digits, training_labels)
trainingaccuracy <- getAccuracy(results[[1]], training_digits, training_labels, results[[2]])
testingaccuracy <- getAccuracy(results[[1]], testing_digits, testing_labels, results[[2]])
print(trainingaccuracy)
print(testingaccuracy)
```