ロジスティック回帰で分類を試す

はじめに

そういえばliblinearはよく使うのに、ロジスティック回帰を自分で書いた事がないなぁと思ったので、ちょっと書いてみた。

詳しい解説記事

とてもいい感じの連載がされている。

http://gihyo.jp/dev/serial/01/machine-learning

L1/L2正則化については以下も参照。

http://www.slideshare.net/guo_dong/logistic-regressionpptx

使用したデータ

LIBSVMのページにあるUCIデータセットのa9aを用いた。
- http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
- 学習データ : a9a
- テストデータ : a9a.t

コード

それらしい結果が出ているので、たぶん合っている。

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>


// Pseudo-random number helpers.

// Marsaglia's xorshift128 generator.
//
// The algorithm is defined on 32-bit words, but `unsigned long` is 64 bits
// wide on LP64 platforms. The result of the left shift is therefore masked
// back down to 32 bits; the right shifts can never widen a value, so the
// whole state (and the return value) stays inside [0, 2^32) on every platform.
//
// Returns the next value of the sequence. The state is kept in function-local
// statics with no synchronization, so the sequence is the same on every run
// and concurrent calls from several threads are not safe.
unsigned long xor128(){
  static unsigned long x=123456789, y=362436069, z=521288629, w=88675123;
  const unsigned long mask32 = 0xFFFFFFFFUL;
  const unsigned long t = (x ^ (x << 11)) & mask32;
  x = y; y = z; z = w;
  w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
  return w;
}

// Uniform random number in [0,1).
//
// xor128() yields an integer in [0, 2^32), so dividing by 2^32 maps it onto
// [0,1) without the bias of a modulo reduction.
double frand(){
  return static_cast<double>(xor128()) / 4294967296.0;
}


// Binary logistic regression trained by stochastic gradient descent (SGD).
//
// An example is a sparse list of (feature name, value) pairs with a 0/1
// label. Weights are keyed by feature name. Scoring (predict / prob) never
// creates weight entries; the train* methods create an entry, starting from 0,
// for every feature they have not seen before.
class LR {
  double eta_; // learning rate
  double C_;   // regularization strength (used by trainL1 / trainL2 only)
  std::unordered_map<std::string,double> weights; // feature name -> weight

  // Logistic function 1 / (1 + e^-z).
  double sigmoid(double z) const {
    return 1.0 / ( 1.0 + std::exp(-z) );
  }

  // P(label = 1 | x): sigmoid of the dot product of x and the weights.
  // Features without a weight contribute nothing. The lookup uses find()
  // instead of operator[] so that scoring does not insert zero-valued
  // entries for unseen features (which would distort weights_num() and
  // zero_weights() and end up in the saved model).
  double sigma(const std::vector< std::pair<std::string,double> >& x) const {
    double z = 0;
    for(const auto& f : x){
      const auto it = weights.find(f.first);
      if(it != weights.end()) z += it->second * f.second;
    }
    return sigmoid(z);
  }

  // Builds a name -> value lookup for x (a later duplicate of a name
  // overrides an earlier one) and gives every feature of x a weight entry of
  // 0 if it has none yet, so the regularized updates below can reach it.
  std::unordered_map<std::string,double>
  indexFeatures(const std::vector< std::pair<std::string,double> >& x){
    std::unordered_map<std::string,double> xx;
    for(const auto& f : x){
      xx[f.first] = f.second;
      weights.emplace(f.first, 0.0); // leaves an existing weight untouched
    }
    return xx;
  }

  // Value of feature `name` in the lookup built by indexFeatures(), or 0.
  static double valueOf(const std::unordered_map<std::string,double>& xx,
                        const std::string& name){
    const auto it = xx.find(name);
    return it != xx.end() ? it->second : 0.0;
  }

public:
  // eta: learning rate, C: regularization strength.
  LR(double eta, double C):eta_(eta),C_(C){}

  // Sets the weight of `feat` to a fresh random value from frand().
  void setRandWeight(const std::string& feat){
    weights[ feat ] = frand();
  }
  // Replaces the learning rate.
  void setEta(double eta){
    eta_ = eta;
  }

  // Predicted label: 1 if the estimated probability exceeds 0.5, else 0.
  int predict(const std::vector< std::pair<std::string,double> >& x) const {
    return sigma(x) > 0.5 ? 1 : 0;
  }
  // Estimated probability that the label of x is 1.
  double prob(const std::vector< std::pair<std::string,double> >& x) const {
    return sigma(x);
  }

  // One SGD step on example (x, t) without regularization.
  // Only the weights of features present in x are touched.
  void train(int t, const std::vector< std::pair<std::string,double> >& x){
    const double pred = sigma(x);
    for(const auto& f : x){
      weights[ f.first ] -= eta_ * ( (pred-t) * f.second );
    }
  }

  // One SGD step on example (x, t) with L1 regularization.
  //
  // Every known weight first takes the plain gradient step (a zero gradient
  // for features absent from x) and is then pulled toward zero by eta * C,
  // stopping at zero instead of overshooting it. Doing the two steps
  // separately matters: folding the penalty into the gradient step and
  // picking its sign from the current weight would pin a weight that is
  // exactly 0 so that it could never become positive again.
  void trainL1(int t, const std::vector< std::pair<std::string,double> >& x){
    const double pred = sigma(x);
    const std::unordered_map<std::string,double> xx = indexFeatures(x);
    const double penalty = eta_ * C_;

    for(auto& kv : weights){
      double w = kv.second - eta_ * (pred-t) * valueOf(xx, kv.first);
      if(w > 0){
        w -= penalty;
        if(w < 0) w = 0;
      }else if(w < 0){
        w += penalty;
        if(w > 0) w = 0;
      }
      kv.second = w;
    }
  }

  // One SGD step on example (x, t) with L2 regularization.
  // Every known weight is updated, because the penalty term C * w applies
  // even to features that are absent from x.
  void trainL2(int t, const std::vector< std::pair<std::string,double> >& x){
    const double pred = sigma(x);
    const std::unordered_map<std::string,double> xx = indexFeatures(x);

    for(auto& kv : weights){
      kv.second -= eta_ * ( (pred-t) * valueOf(xx, kv.first) + C_ * kv.second );
    }
  }


  // Number of weights whose magnitude is below 10e-8 (i.e. 1e-7).
  int zero_weights() const {
    int ret = 0;
    for(const auto& kv : weights){
      if(std::fabs(kv.second) < 10e-8) ret++;
    }
    return ret;
  }
  // Number of features that currently have a weight.
  int weights_num() const {
    return static_cast<int>(weights.size());
  }

  // Writes the weights to `filename`, one "<name> <value>" pair per line.
  // 17 significant digits are written so that load() restores the values
  // exactly (assuming IEEE-754 doubles); the stream default of 6 digits
  // would silently change the model on a save/load round trip.
  void save(const std::string& filename) const {
    std::ofstream ofs(filename);
    ofs.precision(17);
    for(const auto& kv : weights){
      ofs << kv.first << " " << kv.second << '\n';
    }
  }

  // Replaces all weights with the "<name> <value>" pairs read from
  // `filename`. Reading stops at the first entry that does not parse; if the
  // file cannot be opened the model is left empty.
  void load(const std::string& filename){
    std::ifstream ifs(filename);
    weights.clear();

    std::string name;
    double value;
    while(ifs >> name >> value){
      weights[ name ] = value;
    }
  }
};


int main(){

  //パラメータ
  int iter_num = 50; //学習の反復回数
  double eta = 0.01; //学習率
  double C = 0.01; //正則化のパラメータ
  std::ifstream trainfile("./a9a"); //学習データファイル
  std::ifstream testfile("./a9a.t"); //評価データファイル



  LR lr(eta, C);
  std::vector<int> dat_t;
  std::vector< std::vector< std::pair<std::string,double> > > dat;
  std::string line, feat;

  //学習////////////////////////////////////////////////
  //学習データの読み込み
  while(std::getline(trainfile, line)){
    int t;
    std::stringstream ss(line);
    ss >> t;
    if(t>0) t = 1;
    else t = 0;
    dat_t.push_back(t);
    
    std::vector< std::pair<std::string,double> > v;
    while(ss >> feat){
      std::string::size_type idx = feat.find(":");
      std::string name = feat.substr(0, idx);
      double value = atof(feat.substr(idx+1).c_str());

      v.push_back(std::make_pair(name, value));
      lr.setRandWeight(name); //素性の重みをランダム値で初期化しておく
    }
    dat.push_back(v);
  }
  //学習ループ
  for(int t=0; t<iter_num; t++){
    for(int i=0; i<dat.size(); i++){

      lr.train(dat_t[i], dat[i]); //正則化なし      
      //lr.trainL1(dat_t[i], dat[i]); //L1正則化
      //lr.trainL2(dat_t[i], dat[i]); //L2正則化

    }
    eta *= 0.5;
    lr.setEta(eta); //学習率を更新

    //現在の評価値を表示
    int cnt = 0, cnt_zero = 0;
    for(int i=0; i<dat.size(); i++){
      if(dat_t[i] == lr.predict(dat[i])) cnt++;
    }
    std::cerr << "\r";
    std::cerr << t << ":\t" << cnt << "/" << dat.size() << "  closedAcc=" << (double)cnt / dat.size();
    std::cerr << "  ZeroWeights=" << lr.zero_weights() << "/" << lr.weights_num() << std::flush;
  }
  std::cerr << std::endl;

  //重みデータの出力
  lr.save("weights.txt");



  //予測////////////////////////////////////////////////
  dat_t.clear();
  dat.clear();
  lr.load("weights.txt"); //重みデータの読み込み

  //テストデータの読み込み
  while(std::getline(testfile, line)){
    int t;
    std::stringstream ss(line);
    ss >> t;
    if(t>0) t = 1;
    else t = 0;
    dat_t.push_back(t);
    
    std::vector< std::pair<std::string,double> > v;
    while(ss >> feat){
      std::string::size_type idx = feat.find(":");
      std::string name = feat.substr(0, idx);
      double value = atof(feat.substr(idx+1).c_str());

      v.push_back(std::make_pair(name, value));
    }
    dat.push_back(v);
  }

  //評価
  {
    int cnt = 0;
    for(int i=0; i<dat.size(); i++){
      std::cout << dat_t[i] << "\t" << lr.predict(dat[i]) << "\t"; printf("%.8lf\n", lr.prob(dat[i]));

      if(dat_t[i] == lr.predict(dat[i])) cnt++;
    }
    //最終結果の表示
    std::cout << cnt << "/" << dat.size() << "  Acc=" << (double)cnt / dat.size() << std::endl;
  }

  return 0;
}

結果

学習ループのところでtrain/trainL1/trainL2を変えてそれぞれで確認。
パラメータは上記のコードのもの(適当)。

正則化なし
- 13848/16281 Acc=0.850562
L1正則化
- 13634/16281 Acc=0.837418
L2正則化
- 13747/16281 Acc=0.844358

正則化ありの学習はものすごく重い(1事例ごとにすべての素性の重みを更新しているため)。

参考まで、liblinear-1.93での結果(デフォルトパラメータ)。

liblinearのL1LR
- Accuracy = 85.0009% (13839/16281)
liblinearのL2LR
- Accuracy = 84.9886% (13837/16281)

速い。高い。うまい。

http://d.hatena.ne.jp/jetbead/20130510/1368120547
ちなみに、SCWのときの結果より高い数値が出ている……。