M. D. Zeiler, ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
- http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
学習率を自動調整する方法の一つ
他の関連手法の問題点等を改良
- 過去すべての勾配を考慮→直近の勾配だけを考慮したいので、指数関数的に減衰するように考慮
- グローバルな学習率を指定→second order methodsの特性を持つように、パラメータの変化について直近のパラメータの変化から計算し利用
- RMS(root mean square)の計算→小さい値を加えてから平方根をとる

コード

http://d.hatena.ne.jp/jetbead/20131129/1385658526
上記の学習率を定数ではなくAdaDeltaに置き換える。

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
#include <unordered_map>
#include <cmath>


//乱数計算用関数
//xorshift
// 注意: longではなくint(32bit)にすべき
unsigned long xor128(){
  static unsigned long x=123456789, y=362436069, z=521288629, w=88675123;
  unsigned long t;
  t=(x^(x<<11));
  x=y; y=z; z=w;
  return w=(w^(w>>19))^(t^(t>>8));
}
//[0,1)の一様乱数
double frand(){
  return xor128()%10000000000/static_cast<double>(10000000000); 
}


//ロジスティック回帰クラス
class LR_adadelta {
  double rho_, eps_;
  double C_;
  std::unordered_map<std::string,double> weights, eg2, edx2;

  //シグモイド関数
  double sigmoid(double z){
    return 1.0 / ( 1.0 + exp(-z) );
  }

  //σ関数
  double sigma(const std::vector< std::pair<std::string,double> >& x){
    double z = 0;
    for(size_t i=0; i<x.size(); i++){
      z += weights[ x[i].first ] * x[i].second ;
    }
    return sigmoid(z);
  }

  //RMS[x]
  double RMS(double x){
    return sqrt( x + eps_ );
  }

public:
  LR_adadelta(double rho, double eps, double C):
    rho_(rho),eps_(eps),C_(C)
  {}

  //素性の重みをランダム値で初期化
  void setRandWeight(const std::string& feat){
    weights[ feat ] = frand();
  }

  //予測関数
  int predict(const std::vector< std::pair<std::string,double> >& x){
    return sigma(x) > 0.5 ? 1 : 0;
  }
  //確率値の計算
  double prob(const std::vector< std::pair<std::string,double> >& x){
    return sigma(x);
  }
  
  //L2正則化ロジスティック回帰 + AdaDelta
  void trainL2(int t, const std::vector< std::pair<std::string,double> >& x){
    double pred = sigma(x);
    std::unordered_map<std::string,double> xx;
    for(size_t i=0; i<x.size(); i++){
      xx[x[i].first] = x[i].second;
    }

    std::unordered_map<std::string,double>::iterator itr;
    for(itr = weights.begin(); itr != weights.end(); ++itr){
      //Compute Gradient: g_t
      double g = ( (pred-t) * xx[itr->first] + C_ * weights[ itr->first ] );
      //Accumulate Gradient: E[g^2]_t = \rho E[g^2]_{t-1} + (1-\rho) * g^2_t
      eg2[ itr->first ] = rho_ * eg2[ itr->first ] + (1.0 - rho_) * (g * g);
      //Compute Update: \Delta x_t = - (RMS[\Delta x]_{t-1}) / (RMS[g]_t) * g_t
      double deltax = - RMS(edx2[ itr->first ]) / RMS(eg2[ itr->first ]) * g;
      //Accumulate Updates: E[\Delta x^2]_t = \rho E[\Delta x^2]_{t-1} + (1-\rho) * \Delta x^2_t
      edx2[ itr->first ] = rho_ * edx2[ itr->first ] + (1.0 - rho_) * (deltax * deltax);

      //Apply Update: x_{t+1} = x_t + \Delta x_t
      weights[ itr->first ] += deltax;
    }    
  }

  //素性の数
  int weights_num(){
    return weights.size();
  }
  
  //ファイルへ重みを出力
  void save(const std::string& filename){
    std::ofstream ofs(filename);
    std::unordered_map<std::string,double>::iterator itr = weights.begin();
    for(; itr != weights.end(); ++itr){
      ofs << itr->first << " " << itr->second << std::endl;
    }
  }

  //ファイルから重みを入力
  void load(const std::string& filename){
    std::ifstream ifs(filename);
    weights.clear();
    
    std::string name;
    double value;
    while(ifs >> name >> value){
      weights[ name ] = value;
    }
  }
};


int main(){

  //パラメータ
  int iter_num = 1; //学習の反復回数
  double rho = 0.95; //減衰率
  double eps = 0.00001; //小さい定数
  double C = 0.001; //正則化のパラメータ
  std::ifstream trainfile("./a9a"); //学習データファイル
  std::ifstream testfile("./a9a.t"); //評価データファイル

  LR_adadelta lr(rho, eps, C);
  std::vector<int> dat_t;
  std::vector< std::vector< std::pair<std::string,double> > > dat;
  std::string line, feat;

  //学習////////////////////////////////////////////////
  //学習データの読み込み
  while(std::getline(trainfile, line)){
    int t;
    std::stringstream ss(line);
    ss >> t;
    if(t>0) t = 1;
    else t = 0;
    dat_t.push_back(t);
    
    std::vector< std::pair<std::string,double> > v;
    while(ss >> feat){
      std::string::size_type idx = feat.find(":");
      std::string name = feat.substr(0, idx);
      double value = atof(feat.substr(idx+1).c_str());

      v.push_back(std::make_pair(name, value));
      lr.setRandWeight(name); //素性の重みをランダム値で初期化しておく
    }
    dat.push_back(v);
  }
  //学習ループ
  for(int t=0; t<iter_num; t++){
    for(int i=0; i<dat.size(); i++){

      lr.trainL2(dat_t[i], dat[i]);

      //現在の学習データ全部での評価値を表示
      //int cnt = 0, cnt_zero = 0;
      //for(int j=0; j<dat.size(); j++){
      //  if(dat_t[j] == lr.predict(dat[j])) cnt++;
      //}
      //std::cerr << (double)cnt / dat.size() << std::endl;
    }
  }

  //重みデータの出力
  lr.save("weights.txt");



  //予測////////////////////////////////////////////////
  dat_t.clear();
  dat.clear();
  lr.load("weights.txt"); //重みデータの読み込み

  //テストデータの読み込み
  while(std::getline(testfile, line)){
    int t;
    std::stringstream ss(line);
    ss >> t;
    if(t>0) t = 1;
    else t = 0;
    dat_t.push_back(t);
    
    std::vector< std::pair<std::string,double> > v;
    while(ss >> feat){
      std::string::size_type idx = feat.find(":");
      std::string name = feat.substr(0, idx);
      double value = atof(feat.substr(idx+1).c_str());

      v.push_back(std::make_pair(name, value));
    }
    dat.push_back(v);
  }

  //評価
  {
    int cnt = 0, cnt_zero = 0;
    for(int i=0; i<dat.size(); i++){
      //std::cout << dat_t[i] << "\t" << lr.predict(dat[i]) << "\t"; printf("%.8lf\n", lr.prob(dat[i]));

      if(dat_t[i] == lr.predict(dat[i])) cnt++;
    }
    //最終結果の表示
    std::cout << cnt << "/" << dat.size() << "  Acc=" << (double)cnt / dat.size() << std::endl;
  }

  return 0;
}

結果

$ ./a.out
13750/16281  Acc=0.844543

学習データをt事例まで学習したとき(横軸)、学習データの事例全部での評価値(縦軸)の変化をプロット。
青線が、学習率が定数の場合。赤線が、AdaDeltaの場合。

ちゃんと定数の場合より速く収束している(のでたぶん大丈夫)。。。
ハイパーパラメータは、epsilonが小さいと評価値は高まる傾向みたいだけど、収束が遅くなってしまうみたい。。

参考

2014-11-19

ベータ分布のquantile

統計プログラミング

はじめに

boostには、確率分布のquantile(分位数、分布をp:1-pに分割する点)を計算するものが用意されている。
ベータ分布の場合について、自分でも書いてみる。

コード

#include <iostream>
#include <cmath>
//#include <boost/math/distributions/beta.hpp>

class BetaDistribution {
  double eps;
  double val_a, val_b;

  //Ix : regularized incomplete beta function
  double Ix(double x, double a, double b){
    if(a <= 0) return -1;
    if(b <= 0){
      if(x < 1) return 0;
      if(fabs(x-1) < eps) return 1.0;
      return -1;
    }

    if(x > (a+1) / (a+b+2)) return 1 - Ix(1-x, b, a);
    if(x <= 0) return 0;
    double p1 = 0, q1 = 1;
    double p2 = exp(a*log(x) + b*log(1-x) + lgamma(a+b) - lgamma(a) - lgamma(b)) / a, q2 = 1;
    double prev, d;
    for(int k=0; k<500;){
      prev = p2;
      d = -((a+k) * (a+b+k) * x) / ((a+2*k) * (a+2*k+1));
      p1 = p1*d + p2;
      q1 = q1*d + q2;
      k++;
      d = (k * (b-k) * x) / ((a+2*k-1) * (a+2*k));
      p2 = p2*d + p1;
      q2 = q2*d + q1;
      if(fabs(q2) < eps){
        q2 = 1e+9;
        continue;
      }
      p1 /= q2;
      q1 /= q2;
      p2 /= q2;
      q2 = 1;
      if(fabs(p2-prev) < eps) return p2;
    }
    return -1;
  }
public:
  BetaDistribution(double val_a, double val_b):
    val_a(val_a),
    val_b(val_b),
    eps(1e-5) 
  {}
  
  //Int_0^Q { dPhi(x) } >= p
  double quantile(double p){
    double lb = 0.0, ub = 1.0;
    for(int i=0; i<200; i++){
      double m = (lb+ub) / 2.0;
      if(Ix(m, val_a, val_b) < p) lb = m;
      else ub = m;
    }
    return lb;
  }
};

int main(){
  double a, b, p;

  while(std::cin >> a >> b >> p){
    //boost
    //boost::math::beta_distribution<> dist(a, b);
    //std::cout << quantile(dist, p) << std::endl;
    
    //mybeta
    BetaDistribution mybeta(a, b);
    std::cout << mybeta.quantile(p) << std::endl;
  }

  return 0;
}

regularized不完全ベータ関数Ix(a,b)は「C言語による最新アルゴリズム辞典」で紹介されている連分数展開による方法。
quantileの計算は、2分探索で探している・・・
(一応、boostの結果と値はだいたい同じみたいだが、pが0や1に近い点だと誤差が大きくなっている模様)

参考

2014-11-06

Feature Hashingを試す

機械学習自然言語処理プログラミング

はじめに

Feature Hashingについて気になったことがあったので試してみた。

Feature Hashingとは

Hashing trick
ハッシュ関数を使って、素性群をM次元ベクトルにする
- 一種の次元圧縮
Bag of wordsなどの素性をそのままハッシュ値にすることで、素性とIDのペアの辞書などが必要なくなる
- スパムフィルタでは、新語やミススペルでフィルタ回避されてしまうと対応すべき語が増え続ける(辞書が大きくなる)問題などに使える

ベクトルの作り方

いくつか提案されているが、各素性のhash値を計算してmod Mをとったインデクスの所に入れるものとしては主に2つがあるようなので、メモしておく。

Shiらの方法

Shiら(2009)
値をunsigned sumする
φ_i (x) = Σ_{ j:h(j)=i } x_j
- h : ハッシュ関数

Weinbergerらの方法

Weinbergerら(2009)
値をsigned sumする
φ_i (x) = Σ_{ j:h(j)=i } ξ(j) * x_j
- ξ : 1か-1を返すハッシュ関数
こちらの場合は、いくつか条件下で、ハッシュ化したφ(x)とφ(x')の内積の値の期待値は、もとのxとx'の内積の値なる(unbiased)

ちょっとした実験

ハッシュ関数による違いみたいなものがどのぐらいあるのか気になったので、試してみる。
liblinear形式とかの「a:b」のaの数字部分を文字列として扱ってFeatureHashingするみたいなことをしても別に問題ないはずだけど、本当に問題ないのか気になったので、一緒に試してみる。

news20.binaryのデータを使う。
http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#news20.binary

ハッシュ関数は書きやすそうなものとして、naiveな感じのものとFNV-1とmurmurhashで試してみる(深い意味はなし)。
Weinbergerらの方法でベクトルを作る。
Mの値は、2のベキ乗を使った方がよいかもしれないが、50万、10万、5万、1万、5000で見てみる。
liblinear-1.94を使用、オプションは「-v 2」以外全部デフォルト(C=1,L2正則化L2Loss SVC dual)。

$ ./a.out 10000 < news20.binary > news20.binary.10000
$ train -v 2 news20.binary.10000

コード

#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <map>
#include <cmath>

class FeatureHashing {
  std::map<int,std::map<std::string,int> > tbl;

  unsigned int hash(const std::string& str) {
    return hash_murmur(str);
    //return hash_FNV1(str);
    //return hash_naive(str);
  }

  //murmurhash
  // http://en.wikipedia.org/wiki/MurmurHash
  unsigned int hash_murmur(const std::string& str) {
    const char *key = str.c_str();
    unsigned int len = str.length();
     static const unsigned int c1 = 0xcc9e2d51;
     static const unsigned int c2 = 0x1b873593;
     static const unsigned int r1 = 15;
     static const unsigned int r2 = 13;
     static const unsigned int m = 5;
     static const unsigned int n = 0xe6546b64;
   
     unsigned int hash = 123456789U;
   
     const int nblocks = len / 4;
     const unsigned int *blocks = (const unsigned int *) key;
     int i;
     for (i = 0; i < nblocks; i++) {
      unsigned int k = blocks[i];
      k *= c1;
      k = (k << r1) | (k >> (32 - r1));
      k *= c2;
     
      hash ^= k;
      hash = ((hash << r2) | (hash >> (32 - r2))) * m + n;
     }
   
     const unsigned char *tail = (const unsigned char *) (key + nblocks * 4);
     unsigned int k1 = 0;
   
     switch (len & 3) {
     case 3:
      k1 ^= tail[2] << 16;
     case 2:
      k1 ^= tail[1] << 8;
     case 1:
      k1 ^= tail[0];
     
      k1 *= c1;
      k1 = (k1 << r1) | (k1 >> (32 - r1));
      k1 *= c2;
      hash ^= k1;
     }
   
     hash ^= len;
     hash ^= (hash >> 16);
     hash *= 0x85ebca6b;
     hash ^= (hash >> 13);
     hash *= 0xc2b2ae35;
     hash ^= (hash >> 16);
   
     return hash;
  }
 
  //FNV-1
  // http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
  unsigned int hash_FNV1(const std::string& str){
    unsigned int hash;

    hash = 2166136261U;
    for(int i=0; i<str.length(); i++){
      hash = (16777619U * hash) ^ (str[i]);
    }
   
    return hash;
  }
 
  //naive hash
  unsigned int hash_naive(const std::string& str){
    unsigned int h = 0;
    for(int i=0; i<str.length(); i++){
      h = h * 31 + str[i];
    }
    return h;
  }


  //binary hash(string -> {1,-1})
  int binary_hash(const std::string& str){
    unsigned int h = 0;
    for(int i=0; i<str.length(); i++){
      h = h * 37 + str[i];
    }
    return (h % 2 == 1) ? 1 : -1;
  }

public:
  FeatureHashing() { }

  std::vector<double> vectorize(int M, const std::map<std::string,double>& fts){
    std::vector<double> x(M,0);
    for(std::map<std::string,double>::const_iterator i=fts.begin(); i!=fts.end(); ++i){
      unsigned int h = hash(i->first);
      int xi = binary_hash(i->first);
      x[ h % M ] += xi * i->second;
      tbl[ h % M ][i->first] = 1;
    }
    return x;
  }

  void dump(int M){
    int n = 0;
    int zero = 0;
    int sum = 0;
    for(std::map<int, std::map<std::string,int> >::iterator i=tbl.begin(); i!=tbl.end(); ++i){
      //std::cout << (i->second).size() << std::endl;
      n++;
      sum += (i->second).size();
      if((i->second).size() == 0) zero++;
    }
    std::cerr << "N : " << n << std::endl;
    std::cerr << "Zero : " << zero << std::endl;
    std::cerr << "Ave. of hash collision : " << (sum/(double)M) << std::endl;
  }
};


std::vector<std::string> split(const std::string &str, const char c){
  std::vector<std::string> ret;
  std::string tmp = "";
  for(int i=0; i<str.length(); i++){
    if(tmp != "" && str[i] == c){
      ret.push_back(tmp);
      tmp = "";
    }
    else if(str[i] != c){
      tmp += str[i];
    }
  }
  if(tmp != "") ret.push_back(tmp);
  return ret;
}


int main(int argc, char** argv){
  int M = 0;
  if(argc == 2){
    std::stringstream ss(argv[1]);
    ss >> M;
  }
  if(M == 0) return 1;
  std::cerr << "M = " << M << std::endl;



  FeatureHashing fh;
  std::string line;
  int cnt = 0;
  while(std::getline(std::cin, line)){
    std::cerr << "\r" << cnt++;
    std::vector<std::string> sp = split(line, ' ');

    std::map<std::string,double> mp;
    for(int i=1; i<sp.size(); i++){
      double val;
      std::vector<std::string> fsp = split(sp[i], ':');
      std::stringstream ss(fsp[1]);
      ss >> val;
      mp[fsp[0]] = val;
    }

    std::vector<double> fvec = fh.vectorize(M, mp);

    std::cout << sp[0];
    for(int i=0; i<fvec.size(); i++){
      if(fabs(fvec[i])<1e-8) continue;
      std::cout << " " << i+1 << ":" << fvec[i];
    }
    std::cout << std::endl;
  }

  std::cerr << std::endl;
  fh.dump(M);

  return 0;
}

結果

表の一番上は何もしない場合の結果(ベースライン)。

naiveな感じの

Mの大きさ	使ったハッシュ値の種類数	2-fold cv Accuracy
(1,355,191)	-	(95.5791%)
500,000	466,615	95.094%
100,000	100,000	94.5439%
50,000	50,000	93.8188%
10,000	10,000	91.3983%
5,000	5,000	88.7678%

FNV-1

Mの大きさ	使ったハッシュ値の種類数	2-fold cv Accuracy
(1,355,191)	-	(95.5791%)
500,000	467,764	95.4441%
100,000	100,000	94.979%
50,000	50,000	94.3989%
10,000	10,000	91.4183%
5,000	5,000	88.7828%

murmurhash

Mの大きさ	使ったハッシュ値の種類数	2-fold cv Accuracy
(1,355,191)	-	(95.5791%)
500,000	467,088	95.5391%
100,000	100,000	95.059%
50,000	50,000	94.7139%
10,000	10,000	91.6183%
5,000	5,000	88.6777%

(M=50万のとき、使われないハッシュ値ができてる)
次元が1/10ぐらいになってもAccuracyがほとんど減ってない。
ハッシュ関数がかわると大きく変わるかなと思ったけど、そんなに大きな変化はしないみたい。

参考

2014-10-15

Friedman testとNemenyi testメモ

統計

はじめに

複数のアルゴリズムの結果の有意差検定に使用されていたので、メモ。

より詳細に紹介されているのは以下の論文。
Demsar, Statistical Comparisons of Classifiers over Multiple Data Sets, 2006
http://jmlr.csail.mit.edu/papers/volume7/demsar06a/demsar06a.pdf
1999年から2003年のICMLの論文で使われた検定について調査して、アルゴリズムやデータセットの種類によって使うべき検定方法を紹介している。

Friedman test

対応のある順序尺度の3群以上の平均順位に有意差があるかどうかを検定(ノンパラメトリック検定)
- repeated-measures ANOVAのノンパラメトリックに相当
帰無仮説(H0) : μ_0=μ_1=…=μ_k (すべての平均順位が等しい)
検定方法
- χ^2_F = 12n/{k(k+1)} Σ_{i=1}^k { (R_i)^2 - k(k+1)^2 / 4 }を計算
- 自由度(k-1)のχ^2検定をする。nやkが小さい場合はχ^2分布に近似しないので、検定表などを利用
- k : 群数
- n : データ数
- Ri : 群iの順位平均(タイがある場合はそれらの平均順位を利用)
  - r_i^j : i番目のデータセットで、j番目のアルゴリズムの順位(1位、2位・・・)
  - Ri = 1/n * Σ_i { r_i^j }

χ^2_Fは、nとkが十分大きいとき(n>10, k>5)、k-1自由度のχ^2分布になるとされるが、
F_F = { (n-1)χ^2_F } / { n(k-1) - χ^2_F }がk-1と(k-1)(n-1)自由度のF分布になることも示されており、こちらの方がよりよい統計量であることが示されているっぽい。

平均順位とは、「1.0、0.5、0.5、0.1」のようなスコアが出たとして、タイの順位は2位と3位に該当するので、その平均を取って、「1位、2.5位、2.5位、4位」のようにする。

http://www.snap-tck.com/room04/c01/stat/stat04/stat0403.html
http://www2.hak.hokkyodai.ac.jp/fukuda/lecture/SocialLinguistics/12nonpara.html

post-hoc test (事後検定)

単純に、各群のペアに対して検定(多重検定)を行ってしまうと、第1種過誤となる確率が設定した有意水準以上で出る可能性が出てしまう
そこで、有意差があるかどうか検定し、帰無仮説が棄却されたことを前提に行う検定のこと

Nemenyi test

post-hoc testの一つ
- ANOVAに対するTukey testに似ている
- http://ja.wikipedia.org/wiki/%E3%83%86%E3%83%A5%E3%83%BC%E3%82%AD%E3%83%BC%E3%81%AE%E7%AF%84%E5%9B%B2%E6%A4%9C%E5%AE%9A
平均順位の差が、以下のcritical difference(CD)より大きければ有意差があるとする
- CD = q_α sqrt{ k(k+1)/(6n) }
- q_α : Studentized range statistic(スチューデント化された範囲分布) divided by sqrt2に基づくcritical value

2014-10-12

DSIRNLP#6で発表させていただきました＆懺悔とNaiveBayes教入信

機械学習

DSIRNLP#6

10/11にデンソーアイティーラボラトリさんで行われたDSIRNLP#6勉強会で発表させていただきました
聴いていただいた方、ありがとうございました。

勉強会のページ
- http://partake.in/events/38e416b0-5e64-4bd4-8388-4e19acd0ef97
発表資料
- 一部、発表時の資料を修正しています
- 主だって参考にした論文は以下になります
  - Zheng&Webb, Semi-naive Bayesian Classification, 2008
  - http://www.csse.monash.edu.au/~webb/Files/ZhengWebb08a.pdf

No Bayes No Life -Naive Bayesは今でも進化しているようです。- from phyllo

補足(2014/10/12追記修正しました)

質問への回答で、「ナイーブベイズの改良は重箱の隅をつつくような感じ(みんな思いつくけどまだやられていない/確認できていない、インパクトが大きくない、ような意で言いました)かも」「ナイーブベイズの改良は一部で流行っているイメージがあるかも」など言ってしまっている部分がありますが、日頃この周辺の研究や動向を調査しているわけではなく、さらに論文を見ていっただけの印象でしかなく、不正確な回答をしてしまっている可能性が高いです。申し訳ありません・・・

これを簡易的に確認するため、上記の主だって参考にした論文のReference部分の93文献について人力パースし確認してみると、以下のようになっていました。

(左＝出現著者のソート降順。著者の出現数＝著者内に出現した回数をカウント。出現年の頻度＝各論文・書籍などがパブリッシュされた年度)
前者については、文献の出現年について集計してみると、1995年あたりから増えており、2005年あたりにAODEが出たあたりで最大になっているようです。新しい考え方(AODE)が提案されては盛り上がっているように見えますが、個人的には新しい道を作るような研究がぽんぽんでている印象は持っていなく、今後他のいろんなアイデアを取り込んでいってほしいなと思います。
後者については、SemiNaiveBayes分野において、一部の多数の成果・貢献をしている方々がいらっしゃるようですが、ロングテールな分布となっており、「一部の人はめっちゃやってるが、他の人もコミットしていないわけではない」ようです。個人的には、「ナイーブベイズ」自体が広く知られている手法なのに対して、読んでた論文にでてくる人数がそこまで多くない印象を持っていました。調査が足りないと思うので、これからそこら辺を確認していきたいと思います。

詳しい方で補足などありましたら、お願いします。

懺悔とNaive Bayes教入信

準備＆勉強不足で、発表がグダグダで申し訳ありませんでした。が、それでもいろいろなコメントやご意見をいただき、ありがとうございました。
今回の勉強会で、ナイーブベイズとその改良についてより深く学び、より多くの知見を得たいと考え、今回その存在を知ったNaive Bayes教に入信することにしました。

特にNaive Bayesの利用を推進したりするわけではありませんが、NaiveBayes周りについて論文調査や実装、周辺情報についてブログでまとめていきたいと思います。

2014-09-16

NBSVMを試す

機械学習自然言語処理プログラミング

はじめに

S. Wang & C. D. Manning, Baselines and Bigrams: Simple, Good Sentiment and Topic Classificatioin
Naive Bayes素性を利用したSVM(NBSVM)なるものを試してみる。

SVM with NB features(NBSVM)

Log-count ratio r = log( (p / ||p||_1) / (q / ||q||_1) )
- 正例カウントベクトル p = α + Σ_{i:y_i=1} f_i
- 負例カウントベクトル q = α + Σ_{i:y_i=-1} f_i
  - f_i : 各事例iにおける素性ベクトル
  - α : スムージング用パラメータ
モデル w' = (1-β) * w~ + β * w
- w~ : ||w||_1 / |V|
- β : 補間パラメータ(0〜1)

実験ではliblinearを利用して、前処理としてLog-count ratioの計算、モデル学習後に補間モデルの計算、をしている模様(MATLAB)。

使用したデータ

LIBSVMに置いてあるnews20.binaryを利用した
- http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#news20.binary
- 正例はsci.*, comp.*, misc.forsalesの10カテゴリ、負例はそれ以外の10カテゴリ
- 素性については詳細が論文に書かれているが、0/1バイナリ素性を事例ごとにノーマライズ(単位ベクトル化)してあるらしい
時系列を無視して、正例と負例それぞれをシャッフルし、それぞれ8000件ずつを学習用、残りを評価用にした
- 正例数 : 学習8000 + 評価1999
- 負例数 : 学習8000 + 評価1997

コード

liblinear形式のデータをそれぞれ直すためにスクリプトを準備。非常に雑。

学習データからLCRを計算(calc_lcr.pl)

#!/usr/bin/perl
use strict;
use warnings;

my $alpha = shift;

my %pos_count_vector;
my %neg_count_vector;

while(<>){
    chomp;
    my @line = split(/\s+/);

    for(my $i=1; $i<@line; $i++){
        my ($id, $val) = split(/:/, $line[$i]);

        if($line[0] > 0){
            $pos_count_vector{$id}++;
        }else{
            $neg_count_vector{$id}++;
        }
    }
}

my %pos_p;
my %neg_p;
my $sum;

$sum = 0;
foreach my $id (keys %pos_count_vector){
    $sum += $alpha + $pos_count_vector{$id};
    $pos_p{$id} = log($alpha + $pos_count_vector{$id});
}
foreach my $id (keys %pos_count_vector){
    $pos_p{$id} -= log($sum);
}

$sum = 0;
foreach my $id (keys %neg_count_vector){
    $sum += $alpha + $neg_count_vector{$id};
    $neg_p{$id} = log($alpha + $neg_count_vector{$id});
}
foreach my $id (keys %neg_count_vector){
    $neg_p{$id} -= log($sum);
}

my @ids = grep{ our %h; ++$h{$_} < 2 }(keys %pos_p, keys %neg_p);

foreach my $id (sort {$a <=> $b} @ids){
    my $p = exists $pos_p{$id} ? $pos_p{$id} : 0;
    my $n = exists $neg_p{$id} ? $neg_p{$id} : 0;
    print $id, "\t", ($p - $n), "\n";
}

liblinear形式データをlcrデータを使って素性値置き換え(data2lcr.pl)

#!/usr/bin/perl
use strict;
use warnings;

my $lcr_file = shift;

my %lcr;
open(IN, "<", $lcr_file) or die;
while(<IN>){
    chomp;
    my ($id, $val) = split(/\t/);
    $lcr{$id} = $val;
}

while(<>){
    chomp;
    my @line = split(/\s+/);

    print $line[0];

    my $sz = 0;
    for(my $i=1; $i<@line; $i++){
        my ($id, $val) = split(/:/, $line[$i]);
        my $new_val = exists $lcr{$id} ? $lcr{$id} : 0;
        $sz += $new_val * $new_val;
    }
    $sz = sqrt($sz);
    
    for(my $i=1; $i<@line; $i++){
        my ($id, $val) = split(/:/, $line[$i]);
        my $new_val = exists $lcr{$id} ? $lcr{$id} : 0;

        print " ", $id, ":", (($sz<1e-8)?$new_val:($new_val/$sz));
    }
    print "\n";
}

学習したモデルを修正(model_modif.pl)

#!/usr/bin/perl
use strict;
use warnings;

my $beta = shift;

my @w;
my $sum = 0;
my $flg = 0;
while(<>){
    chomp;
    if($_ eq 'w'){
        $flg = 1;
        next;
    }
    if($flg == 0){
        print $_, "\n";
    }else{
        push @w, $_;
        $sum += abs($_);
    }
}

print "w\n";
foreach my $val (@w){
    my $new_val = (1.0 - $beta) * ($sum / scalar(@w)) + $beta * $val;
    print $new_val, "\n";
}

結果

liblinearのパラメータは、s=2を用いる以外は全部デフォルトでAccuracyをみてみる。
(s=1だとバイナリ素性のときイテレーション回数最大値までいってしまうため)

#!/bin/zsh

#正例負例
#grep "^+1" news20.binary > news20.binary.pos
#grep "^-1" news20.binary > news20.binary.neg

#データをシャッフル
#perl -MList::Util=shuffle -e 'print shuffle(<>)' < news20.binary.neg >news20.binary.neg.shuf
#perl -MList::Util=shuffle -e 'print shuffle(<>)' < news20.binary.pos >news20.binary.pos.shuf

#データの前半と後半を、学習用と評価用に分ける
head -8000 news20.binary.pos.shuf > news20.binary.pos_train
head -8000 news20.binary.neg.shuf > news20.binary.neg_train
tail -n +8001 news20.binary.pos.shuf > news20.binary.pos_test
tail -n +8001 news20.binary.neg.shuf > news20.binary.neg_test

#学習用と評価用データ
cat news20.binary.pos_train news20.binary.neg_train > train
cat news20.binary.pos_test news20.binary.neg_test > test

#LCRの計算とデータの素性値を置換
perl ./bin/calc_lcr.pl 0.25 < train > train.lcr
perl ./bin/data2lcr.pl train.lcr < train > train.new
perl ./bin/data2lcr.pl train.lcr < test > test.new
#学習
./liblinear/liblinear-1.94/train -s 2 train.new
for i in {0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1}; do 
  perl ./bin/model_modif.pl $i < train.new.model > model.beta$i;
done
#評価
for i in {0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1}; do
  echo -n $i"\t"; 
  ./liblinear/liblinear-1.94/predict test.new model.beta$i out; 
done


#ベースラインの計算(素性値を0/1のバイナリ値にした場合)
perl ./bin/data2binary.pl < train > train.binary
perl ./bin/data2binary.pl < test > test.binary
./liblinear/liblinear-1.94/train -s 2 train.binary
./liblinear/liblinear-1.94/predict test.binary train.binary.model out

#ベースラインの計算(素性値を0/1のバイナリ値にしたあと、normalize(単位ベクトル化))
./liblinear/liblinear-1.94/train -s 2 train
./liblinear/liblinear-1.94/predict test train.model out

バイナリ素性
- Accuracy = 95.5205% (3817/3996)
バイナリ素性＋単位ベクトル化
- Accuracy = 96.5215% (3857/3996)
α=0.25でLCR値＋単位ベクトル化
- β=0.0 : Accuracy = 4.02903% (161/3996)
- β=0.1 : Accuracy = 11.8118% (472/3996)
- β=0.2 : Accuracy = 34.1592% (1365/3996)
- β=0.3 : Accuracy = 69.8699% (2792/3996)
- β=0.4 : Accuracy = 90.6156% (3621/3996)
- β=0.5 : Accuracy = 95.8959% (3832/3996)
- β=0.6 : Accuracy = 97.1221% (3881/3996)
- β=0.7 : Accuracy = 97.3724% (3891/3996)
- β=0.8 : Accuracy = 97.3974% (3892/3996)
- β=0.9 : Accuracy = 97.3974% (3892/3996)
- β=1.0 : Accuracy = 97.3724% (3891/3996)

ベースラインよりも良い結果が得られているっぽい。(パラメータのβが1.0じゃないところで最大値になっている)
データにも依るかもしれないけど、ノーマライズやTFIDFなど素性値を変えてみる一つとして有用そう。

参考

2014-09-05

Interpolation Search

プログラミング

Interpolation Searchとは

補間探索、内挿探索
二分探索での範囲の中間値を利用する代わりに、範囲の両端の値から探したい値の位置にあたりをつけて、その値を利用して探索していく方法
- 二分探索では、配列の値に関係なく範囲の中間の値を利用して探索
探索時間はO(log log n)
- 二分探索では、O(log n)
- データに依っては、二分探索の方が速くなる場合もあり得る

計算例

配列vが「1,2,3,5,6,10,12」として、
left = 0, v[left] = 1
right = 6, v[right] = 12
である時、
2分探索では、mid = floor( (left + right) / 2 ) = floor( (0+6) / 2 ) = 3のように決めたりするが、
Interpolation Searchでは、値も考慮し、value=6を探したい場合、
mid = floor( left + (value - v[left]) * (right - left) / (v[right] - v[left]) ) = floor( 0 + (6-1) * (6-0) / (12-1) ) = 2のように決める。
計算的に、一様に値が分布していることを仮定して値の位置を計算しているので、一様であれば有効に働く。

コード

#include <iostream>
#include <vector>

template<class T>
class InterpolationSearch {
public:
  //配列vから値valueを探し、そのindexを返す
  //存在しない場合は-1を返す
  int search(const std::vector<T>& v, const T& value){
    if(v.size() == 0) return -1;

    int left = 0;
    int right = v.size()-1;
    
    if(left == right) return (v[left] == value) ? left : -1;
    if(value < v[left] || v[right] < value) return -1;

    while(true){
      int mid = (int)( left + (long long)(value - v[left]) * (right - left) / (v[right] - v[left]) );
      
      if(v[mid] == value){
        return mid;
      }
      else if(v[mid] < value){
        left = mid + 1;
      }
      else if(value < v[mid]){
        right = mid - 1;
      }
      if(value < v[left] || v[right] < value) break;
    }
    return -1;
  }
};


int main(){

  InterpolationSearch<int> is;

  //0〜99の、index == valueなソート済み配列を用意
  std::vector<int> v;
  for(int i=0; i<100; i++){
    v.push_back(i);
  }

  //配列から-1〜100を検索
  for(int i=-1; i<101; i++){
    std::cout << is.search(v, i) << std::endl;
  }

  return 0;
}

ところで文字列の場合、ソートはいいとしても、「値の差の大きさ」も必要になるので、どうするのがいいかなと思って調べてみたら、StackOverflowで同様の質問がでていた。
自明じゃないし、二分探索より遅くなる可能性もあるし、結局実用には難しいか・・・

参考

2014-07-31

マルチラベル分類メモ

機械学習

はじめに

G. Tsoumakas, I. Katakis, I. Vlahavas., Mining Multi-label Data
http://lpis.csd.auth.gr/paper_details.asp?publicationID=290
マルチラベル分類問題について、メモ。

マルチラベル分類問題

1つの事例が、複数のラベル(ラベルの集合)に同時に分類されうる分類問題
- 例：「ダビンチコード」の記事のカテゴリ→宗教、映画
マルチラベルの教師あり学習では、主に以下のタスクがある
- マルチラベルクラス分類(multi label classification)
- ラベルランキング(label ranking)
また、マルチラベル学習の方法は、主に2つのグループに分けられる
- Problem Transformation
- Algorithm Adaptation

シングルラベル問題へ変換(Problem Transformation)

single label classifierを使う
学習データはどのようにシングルラベル用に変換すればよいか？

copy transformation

マルチラベルの事例を、ついているラベル数個の事例にコピーする
いくつかのバリエーションがある
- copy
  - 事例についている複数ラベルのそれぞれをシングルラベルとする事例を用意する
- dubbed copy-weight
  - copyし、その時の事例の重みを1/事例数にする
- select family
  - 事例についている複数ラベルから1つ選んでそれだけにする(他を無視する)
  - select-max, select-min, select-random
- ignore
  - 複数ラベルの事例は無視してしまう

label powerset

冪集合をラベルとして扱う
- 例：1,2,3のラベルがある場合は、1,2,3,1&2,1&3,2&3,1&2&3をラベルとして扱う

pruned problem transformation

label powersetで、出現数(2とか3とかユーザー定義)が少ないラベルを無視する

RAndom k-labELsets method(RAkEL)

http://lpis.csd.auth.gr/publications/tsoumakas-tkde10.pdf
k-labelsetとは、ラベル集合Lの部分集合R(k=|R|)のこと
いろんな組み合わせの部分集合について、label powersetして、アンサンブル学習

binary relevance

よく使われる方法
各ラベルについて、関連するかしないかのbinary分類器を作成し、各分類器の結果の和集合を出力する

ranking by pairwise comparison(RPC)

ラベル集合から、その事例についてどちらか片方だけラベルが正解になるような2ラベルを取りだす
その2ラベルを区別する2値分類を学習し、予測では各ラベルについて投票してランキングする

calibrated label ranking(CLR)

RPCに仮想ラベルを導入し、binary relevance

INSDIF algorithm

http://cse.seu.edu.cn/people/zhangml/files/Multi-label%20learning%20by%20instance%20differentiation.pdf

マルチラベルに対応したアルゴリズムの適用(Algorithm Adaptation)

そもそもマルチラベルデータを扱えるアルゴリズムを適用する方法もある

Decision Tree, Boosting

C4.5 algorithm
AdaBoost.MH, AdaBoost.MR

Probabilistic Methods

probabilistic generative model for mutil-label document
conditional random fields

Neural Networks, SVM

BP-MLL
ML-RBF
multi-class multi-label perceptron (MMP)
SVM algorithm (minimize the ranking loss)

Lazy, Associative Methods

k-NNのlazy拡張
MMAC
手法をcombineしたもの

参照

2014-07-15

Graph of Word、TW-IDFとTFのnormalizationメモ

自然言語処理

はじめに

Rousseau et al., Graph-of-word and TW-IDF: New Approach to Ad Hoc IR
http://www.lix.polytechnique.fr/~rousseau/papers/rousseau-cikm2013.pdf
文書dのグラフ的表現とそこから計算されるTW-IDFというTermの重み付けについて、メモ。

Graph of Word

文書を重みなし有向グラフで表現
- 頂点: 各(unique)term
- 辺: 固定幅(4ぐらい?)の窓内のtermとの共起
- 辺の向き: termの出現順序(前から後ろ方向のみ)
  - 多重辺にはしない

TW-IDF

TW-IDF(t,d) = tw(t,d) / (1-b+b*|d|/avdl) * log( (N+1) / df(t) )
- tw(t,d): 文書dのgraph of word表現における頂点tの重み(term weight)
- b: [0,1]のパラメータ(0.003)
- N: 文書数
tw(t,d)には「入次数」を使うのがよいらしい
- 他には以下なども考えられたが、計算コストなどの問題から簡単な「次数」を利用
  - closeness(頂点間の最小距離)
  - betweenness(頂点間の最短パスの頂点の出現頻度)
  - eigenvectors(隣接行列のspectral decomposition)
  - PageRank

TF normalization

TFの正規化についても紹介されている。

1から2への変化と100から101への変化を同様に扱うより、前者を大きく扱う方がよかったりする。
以下のようなものがある。

concave function(TF_k, TF_l)
- TF_k(t,d) = (k_1+1)*tf(t,d) / (k_1 + tf(t,d))
- TF_l(t,d) = 1 + ln(1+ln(tf(t,d)))
pivoted document length normalization(TF_p)
- TF_p(t,d) = tf(t,d) / (1-b+b*|d|/avdl)
lower-bounding regularization(TF_δ)
- TF_δ(t,d) = tf(t,d)+δ if tf(t,d)>0, otherwise 0
  - tf(t,d): ドキュメントdにおけるterm tの頻度
  - k_1: corresponding to the asymptotical maximal gain achievable by multiple occurrences compared to a single occurrence
  - b: [0,1]のパラメータ
  - |d|: 文書の長さ
  - avdl: 全文書における文書の平均の長さ
  - δ: lower bounding gap(1.0)

http://d.hatena.ne.jp/jetbead/20110904/1315147130

去年の反省

今年の抱負

その他の個人的な目標

はじめに

AdaDeltaとは

関連手法

コード

結果

参考

はじめに

コード

参考

はじめに

Feature Hashingとは

ベクトルの作り方

Shiらの方法

Weinbergerらの方法

ちょっとした実験

コード

結果

参考

はじめに

Friedman test

post-hoc test (事後検定)

Nemenyi test

DSIRNLP#6

補足(2014/10/12追記修正しました)

懺悔とNaive Bayes教入信

はじめに

SVM with NB features(NBSVM)

使用したデータ

コード

学習データからLCRを計算(calc_lcr.pl)

liblinear形式データをlcrデータを使って素性値置き換え(data2lcr.pl)

学習したモデルを修正(model_modif.pl)

結果

参考

Interpolation Searchとは

計算例

コード

参考

はじめに

マルチラベル分類問題

シングルラベル問題へ変換(Problem Transformation)

copy transformation

label powerset

pruned problem transformation

RAndom k-labELsets method(RAkEL)

binary relevance

ranking by pairwise comparison(RPC)

calibrated label ranking(CLR)

INSDIF algorithm

マルチラベルに対応したアルゴリズムの適用(Algorithm Adaptation)

Decision Tree, Boosting

Probabilistic Methods

Neural Networks, SVM

Lazy, Associative Methods

参照

はじめに

Graph of Word

TW-IDF

TF normalization