Negative/Positive Thinking

SCWを試す

機械学習プログラミング

はじめに

分類器の決定版(？)的なSoft Confidence Weighted Learningを試してみた。

Soft Confidence Weighted Learningとは

2012年に提案された、各重みを正規分布と考え更新時にその分布が変わるようにしたConfidence Weighted(CW)関係のノイズに強くなった版
オンライン学習
http://icml.cc/2012/papers/86.pdf

詳しい解説記事

http://d.hatena.ne.jp/kisa12012/20120625/1340616659

使用したデータ

LIBSVMのページにあるUCIデータセットのa9aを用いた
- http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
- 学習データ : a9a
- テストデータ : a9a.t

コード

毎度のことながら怪しいコード。

#include <iostream>
#include <vector>
#include <cmath>
#include <fstream>
#include <sstream>
#include <map>

class SCW {
  static const int erf_N = 30;
  static const double erf_PI = 3.14159265358979323846264338;
  int m;
  double eta, C, phi;
  std::vector<double> mu;
  std::vector< std::vector<double> > sigma;


  double erf(double z){
    //反復的に計算
    double res = 0;
    for(int n=0; n<=erf_N; n++){
      double tmp = 1.0;
      for(int k=1; k<=n; k++){
        tmp *= -1.0 * z * z / k;
      }
      res += z / (2.0 * n + 1.0) * tmp;
    }
    return 2.0 / sqrt(erf_PI) * res;
  }

  //標準正規分布の累積分布関数
  double CDF_normal(double x){
    return 0.5 * (1.0 + erf(x / sqrt(2.0)));
  }

  //行列やベクトルの計算用
  double quad(const std::vector<double>& x, 
              const std::vector< std::vector<double> >& sig,
              const std::vector<double>& y){
    std::vector<double> tmp(m, 0.0);
    for(size_t i=0; i<m; i++){
      for(size_t j=0; j<m; j++){
        tmp[i] += sig[i][j] * y[j];
      }
    }
    return dot(x, tmp);
  }
  double dot(const std::vector<double>& x, const std::vector<double>& y){
    double res = 0.0;
    for(size_t i=0; i<m; i++){
      res += x[i] * y[i];
    }
    return res;
  }
  std::vector<double> mat_prod1(const std::vector< std::vector<double> >& sig,
                   const std::vector<double>& y){
    std::vector<double> tmp(m, 0.0);
    for(size_t i=0; i<m; i++){
      for(size_t j=0; j<m; j++){
        tmp[i] += sig[i][j] * y[j];
      }
    }
    return tmp;
  }
  std::vector< std::vector<double> > mat_prod2(const std::vector< std::vector<double> >& mat1,
                                               const std::vector< std::vector<double> >& mat2){
    std::vector< std::vector<double> > res(m, std::vector<double>(m, 0.0));
    for(size_t i=0; i<m; i++){
      for(size_t j=0; j<m; j++){
        for(size_t k=0; k<m; k++){
          res[i][j] += mat1[i][k] * mat2[k][j];
        }        
      }
    }
    return res;
  }
  std::vector< std::vector<double> > mat_sub_and_scalarprod(const std::vector< std::vector<double> >& mat1,
                                                            double scal,
                                                            const std::vector< std::vector<double> >& mat2){
    std::vector< std::vector<double> > res(m, std::vector<double>(m, 0.0));
    for(size_t i=0; i<m; i++){
      for(size_t j=0; j<m; j++){
        res[i][j] = mat1[i][j] - scal * mat2[i][j];
      }
    }
    return res;
  }
    

  //損失関数
  double suffer_loss(int y, const std::vector<double>& x){
    double res = phi * sqrt(quad(x, sigma, x)) - y * dot(x, mu);
    //return std::max(0.0, res);
    return res;
  }

public:
  SCW(int m_, double eta_, double C_):m(m_),eta(eta_),C(C_),
                                      mu(m_, 0.0),
                                      sigma(m_, std::vector<double>(m_, 0.0)){
    phi = 1.0 / CDF_normal(eta);
    for(size_t i=0; i<m; i++){
      sigma[i][i] = 1.0;
    }
  }

  int predict(const std::vector<double>& x){
    if(dot(x, mu) < 0.0) return -1;
    return 1;
  }

  void train(int y, const std::vector<double>& x){
    if(suffer_loss(y, x) > 0.0){
      /*
      //SCW-I
      double vt = quad(x, sigma, x);
      double zeta = 1 + phi * phi;
      double mt = y * dot(x, mu);
      double psi = 1.0 + (phi * phi) / 2.0;
      double alpha_tmp = (-mt*psi + sqrt(mt*mt*phi*phi*phi*phi/4.0 + vt*phi*phi*zeta)) / (vt * zeta);
      double alpha = std::min(C, std::max(0.0, alpha_tmp));
      double ut_tmp = -alpha*vt*phi + sqrt(alpha*alpha*vt*vt*phi*phi + 4.0*vt);
      double ut = ut_tmp * ut_tmp / 4.0;
      double beta = (alpha*phi) / (sqrt(ut) + vt*alpha*phi);
      */

      //SCW-II
      double vt = quad(x, sigma, x);
      double nt = vt + 1.0 / (2.0 * C);
      double mt = y * dot(x, mu);
      double gammat = phi * sqrt(phi*phi*mt*mt*vt*vt + 4.0*nt*vt*(nt+vt*phi*phi));
      double alpha_tmp = (-(2.0*mt*nt + phi*phi*mt*vt) + gammat) / (2.0*(nt*nt + nt*vt*phi*phi));
      double alpha = std::max(0.0, alpha_tmp);
      double ut_tmp = -alpha*vt*phi + sqrt(alpha*alpha*vt*vt*phi*phi + 4.0*vt);
      double ut = ut_tmp * ut_tmp / 4.0;
      double beta = alpha * phi / (sqrt(ut) + vt*alpha*phi);


      std::vector<double> sigx = mat_prod1(sigma, x);
      for(size_t i=0; i<m; i++){
        mu[i] += alpha * y * sigx[i];
      }
      std::vector< std::vector<double> > xtx(m, std::vector<double>(m, 0.0));
      for(size_t i=0; i<m; i++){
        for(size_t j=0; j<m; j++){
          xtx[i][j] = x[i] * x[j];
        }
      }
      sigma = mat_sub_and_scalarprod(sigma, beta, mat_prod2(sigma, mat_prod2(xtx, sigma)));
    }
  }

};


std::pair< int, std::vector<double> > parse_data(int m, const std::string& line){
  std::vector<double> res(m, 0.0);
  int y;

  std::stringstream ss(line);
  std::string str;
  ss >> y;
  while(ss >> str){
    int n = 0;
    size_t i = 0;
    for(; i<str.length(); i++){
      if(str[i] == ':'){ i++; break; }
      n = 10*n + (str[i]-'0');
    }
    std::stringstream tt(str.substr(i));
    double val;
    tt >> val;
    //std::cerr << n << " : " << val << std::endl;
    res[n-1] = val; //追記:0-originに直した
  }
  return std::make_pair< int, std::vector<double> >(y, res);
}

int main(int argc, char** argv){
  std::string line;

  SCW scw(123, 0.9, 1.0);
  
  //学習
  std::ifstream train_data(argv[1]);
  while(std::getline(train_data, line)){
    std::pair< int,std::vector<double> > res = parse_data(123, line);
    scw.train(res.first, res.second);
  }

  //評価
  int all = 0, ok = 0;
  std::ifstream test_data(argv[2]);
  while(std::getline(test_data, line)){
    std::pair< int,std::vector<double> > res = parse_data(123, line);
    int pred = scw.predict(res.second);
    if(pred == res.first){
      ok++;
    }
    all++;
  }

  std::cout << "Accuracy = " << (100.0 * ok / all) << "% (" << ok << "/" << all << ")" << std::endl;

  return 0;
}

結果

適当な値で試してみた。

SCW-I, eta=0.9, C=1.0
- Accuracy = 83.8155% (13646/16281)
SCW-I, eta=0.8, C=1.0
- Accuracy = 83.7418% (13634/16281)
SCW-II, eta=0.9, C=1.0
- Accuracy = 84.3929% (13740/16281)

なかなかいい値がでているよう。

分散の変化

追記：2013/5/11
SCW-I, eta=0.9, C=1.0のケースについて、学習時の分散sigmaの対角要素の値の変化をプロットしてみた。
(縦軸：各対角要素の値、横軸：i番目の学習データ)

他の結果