Table of Contents
  1. SVM
  • Code:
    1. Python
  • R
  • SVM via gradient descent

1. SVM

The SVM is similar to logistic regression and boosting; the difference lies in the surrogate loss function. Logistic regression uses the logistic loss, boosting uses the exponential loss, and the SVM uses the hinge loss.

They are written, respectively, as:

1. exponential loss: exp(-yf(x))

2. logistic loss: log[1+exp(-yf(x))]

3. hinge loss: [1-yf(x)]+

(the trailing plus sign is a subscript denoting the positive part, i.e. the value is clipped at zero from below)
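
A quick numerical comparison (values mine, natural log) shows what makes the hinge loss special. At a margin of yf(x) = 2, i.e. a point comfortably on the correct side:

exp(-2) ≈ 0.135,  log[1+exp(-2)] ≈ 0.127,  [1-2]+ = 0

Only the hinge loss is exactly zero once the margin exceeds 1, so such points drop out of the gradient entirely; the points that remain active are the support vectors.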

Based on the loss above, gradient descent gives the Python and R code below.

The idea is sketched in the figure.
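
To make explicit what the code implements (a sketch in the notation above; note the code applies the shrinkage with lamda alone, not learning_rate*lamda): folding the intercept into beta and writing f(x) for the linear score, the regularized hinge objective is

L(beta) = sum_i [1 - y_i f(x_i)]+ + (lamda/2)*||beta||^2

Each iteration steps against the subgradient of the hinge term and then shrinks every coordinate except the intercept:

beta <- beta + learning_rate * (sum of y_i*x_i over all i with y_i f(x_i) < 1), then beta <- (1 - lamda)*beta (intercept excluded)

Only margin violators contribute to the step; this is exactly the db mask computed in both implementations below.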

Data download: digits.csv
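
Before running anything, it is worth confirming that digits.csv has the layout the loading code assumes: feature (pixel) columns first and the digit label in the last column. That layout is inferred from the indexing in the code below, not from any documentation, so treat this as a sketch:

import pandas as pd

df = pd.read_csv('digits.csv')
print(df.shape)                 # (n, p+1): p pixel columns plus one label column
print(df.iloc[:, -1].unique())  # the digit labels present, drawn from 0-9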


Code:

Python

import numpy as np

np.random.seed(1)

def train(X, Y, num_iterations=1000, learning_rate=0.01, lamda=0.1):
    """Train a linear SVM on the hinge loss by gradient descent.
    lamda is the regularization coefficient."""
    (n, p) = X.shape                   # feature matrix: n samples, p features
    p = p + 1                          # one extra dimension for the intercept
    X1 = np.hstack((np.ones(n).reshape((n, 1)), X))  # prepend a column of ones
    Y = 2 * Y - 1                      # map labels from {0, 1} to {-1, +1}
    beta = np.zeros(p).reshape(p, 1)   # initialize the parameters
    for i in range(num_iterations):
        s = np.dot(X1, beta)
        db = s * Y < 1                 # margin violators: samples with y*f(x) < 1
        # tile repeats db*Y once along rows and p times along columns, i.e. it
        # copies the column db*Y p times so each violator's y_i multiplies its
        # entire feature row in X1
        dbeta = np.dot(np.ones(n), np.tile(db * Y, (1, p)) * X1)
        beta = beta + (learning_rate * dbeta).reshape(p, 1)
        beta[1:p] = beta[1:p] - lamda * beta[1:p]  # shrink all but the intercept
    return beta

def getAccuracy(beta, X, Y):
    (n, p) = X.shape
    X1 = np.hstack((np.ones(n).reshape((n, 1)), X))
    pred = np.dot(X1, beta)            # correct when pred and Y agree in sign
    Y = 2 * Y - 1
    num = 0
    for i in range(n):
        if pred[i] * Y[i] > 0:
            num = num + 1
    accuracy = num / n
    return accuracy

# load data
def load_digits(subset=None, normalize=True):
    """
    Load digits and labels from digits.csv.

    Args:
        subset: A subset of digits from 0 to 9 to return.
            If not specified, all digits will be returned.
        normalize: Whether to normalize data values to between 0 and 1.

    Returns:
        digits: Digits data matrix of the subset specified.
            The shape is (n, p), where
            n is the number of examples,
            p is the dimension of features.
        labels: Labels of the digits in an (n, ) array.
            Each label[i] is the label for data[i, :].
    """
    # load digits.csv, adopted from sklearn.
    import pandas as pd
    df = pd.read_csv('digits.csv')
    # only keep the numbers we want.
    if subset is not None:
        df = df[df.iloc[:, -1].isin(subset)]
    # convert to numpy arrays.
    digits = df.iloc[:, :-1].values.astype('float')
    labels = df.iloc[:, -1].values.astype('int')
    # Normalize digit values to between 0 and 1.
    if normalize:
        digits -= digits.min()
        digits /= digits.max()
    # Change the labels to 0 and 1.
    if subset is not None:
        for i in range(len(subset)):
            labels[labels == subset[i]] = i
    labels = labels.reshape((labels.shape[0], 1))
    return digits, labels

def split_samples(digits, labels):
    """Split the data into a training set (70%) and a testing set (30%)."""
    num_samples = digits.shape[0]
    num_training = round(num_samples * 0.7)
    indices = np.random.permutation(num_samples)
    training_idx, testing_idx = indices[:num_training], indices[num_training:]
    return (digits[training_idx], labels[training_idx],
            digits[testing_idx], labels[testing_idx])

#====================================
# Load digits and labels.
digits, labels = load_digits(subset=[3, 5], normalize=True)
training_digits, training_labels, testing_digits, testing_labels = split_samples(digits, labels)
print('# training', training_digits.shape[0])
print('# testing', testing_digits.shape[0])

# Train the SVM and display training/testing accuracy.
beta = train(training_digits, training_labels)
training_accuracy = getAccuracy(beta, training_digits, training_labels)
testing_accuracy = getAccuracy(beta, testing_digits, testing_labels)
print('Accuracy on training data: %f' % training_accuracy)
print('Accuracy on testing data: %f' % testing_accuracy)
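
As a quick sanity check that does not depend on digits.csv, the same train / getAccuracy pair can be exercised on synthetic, linearly separable data (a minimal sketch; the two Gaussian blobs and their separation are my choice):

import numpy as np

np.random.seed(0)
n = 200
# two Gaussian blobs on opposite sides of the origin, labelled 1 and 0
X = np.vstack((np.random.randn(n, 2) + 2, np.random.randn(n, 2) - 2))
Y = np.vstack((np.ones((n, 1)), np.zeros((n, 1))))

beta = train(X, Y)                           # train() and getAccuracy() as defined above
print('accuracy:', getAccuracy(beta, X, Y))  # should be close to 1.0 on separable data

Incidentally, the np.tile call in train is not strictly necessary: (db * Y) * X1 broadcasts the (n, 1) column across the p columns of X1 and produces the same array, and np.dot(np.ones(n), M) is simply M.sum(axis=0).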

R

    library(data.table) # allows us to use function fread,
    # which quickly reads data from csv files 
    
    
    train<-function(X,Y,num_iterations=1000,learning_rate=0.01,lamda=0.1)  #lamda is regularization coefficient
    {
      n=dim(X)[1]
      p=dim(X)[2]+1
      X1=cbind(rep(1,n),X)
      Y=2*Y-1
      beta=matrix(rep(0,p),nrow=p)
      for (i in 1:num_iterations)
      {
        S=X1%*%beta
    db=S*Y<1   # margin violators: samples with y*f(x) < 1
    # matrix() recycles db*Y column-wise across the p columns, so each
    # violator's y_i multiplies its entire feature row in X1
    dbeta = matrix(rep(1,n),nrow=1)%*%(matrix(db*Y,n,p)*X1)
    
        beta=beta+learning_rate*t(dbeta)
        beta[2:p]=beta[2:p]-lamda*beta[2:p]
      }
     return(beta)
    }
    
    
    getAccuracy <- function(beta, X, Y)
    {
      numSamples = dim(X)[1]
      X1 = cbind(rep(1, numSamples), X)
      p = sign(X1%*%beta)
      Y = 2*Y-1
      num = 0
      for(i in 1:numSamples)
      {
        if(p[i]==Y[i])
          num = num + 1
      }
      accuracy = num/numSamples
      return(accuracy)
    }
    # load data
    load_digits <- function(subset=NULL, normalize=TRUE) {
    
      #Load digits and labels from digits.csv.
    
      #Args:
  #subset: A subset of digits from 0 to 9 to return.
      #If not specified, all digits will be returned.
      #normalize: Whether to normalize data values to between 0 and 1.
    
      #Returns:
      #digits: Digits data matrix of the subset specified.
      #The shape is (n, p), where
      #n is the number of examples,
      #p is the dimension of features.
      #labels: Labels of the digits in an (n, ) array.
  #Each label[i] is the label for data[i, :]
    
      # load digits.csv, adopted from sklearn.
    
      df <- fread("digits.csv") 
      df <- as.matrix(df)
    
  last_col <- dim(df)[2]   # the label lives in the last column

  ## only keep the numbers we want.
  if (length(subset)>0) {

    l_col <- df[,last_col]
    index = NULL

    for (i in 1:length(subset)){

      number = subset[i]
      index = c(index,which(l_col == number))
    }
    index = sort(index)   # sort() returns a copy, so assign the result back
    df = df[index,]
  }

  # split into the feature matrix and the label vector.
  digits = df[,-last_col]
  labels = df[,last_col]
    
      # Normalize digit values to 0 and 1.
      if (normalize == TRUE) {
        digits = digits - min(digits)
        digits = digits/max(digits)}
    
    
  # Change the labels to 0 and 1 (skip when no subset was requested).
  if (length(subset) > 0) {
    for (i in 1:length(subset)) {
      labels[labels == subset[i]] = i-1
    }
  }
    
      return(list(digits, labels))
    
    }
    
    split_samples <- function(digits,labels) {
    
      # Split the data into a training set (70%) and a testing set (30%).
    
      num_samples <- dim(digits)[1]
      num_training <- round(num_samples*0.7)
      indices = sample(1:num_samples, size = num_samples)
      training_idx <- indices[1:num_training]
      testing_idx <- indices[-(1:num_training)]
    
      return (list(digits[training_idx,], labels[training_idx],
                   digits[testing_idx,], labels[testing_idx]))
    }
    
    #====================================
    # Load digits and labels.
    result = load_digits(subset=c(1, 7), normalize=TRUE)
    digits = result[[1]]
    labels = result[[2]]
    
    result = split_samples(digits,labels)
    training_digits = result[[1]]
    training_labels = result[[2]]
    testing_digits = result[[3]]
    testing_labels = result[[4]]
    
# print the number of training and testing samples
nrow(training_digits)
nrow(testing_digits)
    
# Train the SVM and display training/testing accuracy.
    beta = train(training_digits, training_labels)
    
training_accuracy = getAccuracy(beta, training_digits, training_labels)
testing_accuracy = getAccuracy(beta, testing_digits, testing_labels)
cat("Accuracy on training data:", training_accuracy, "\n")
cat("Accuracy on testing data:", testing_accuracy, "\n")