トピックモデルメモ - Negative/Positive Thinking

はじめに

トピックモデルについてメモ。

トピックモデルとは

文書は、何らかの話題について書かれていたりする
- 「ある文書内に一緒にでてくる単語は、意味的な関連性が強い」など考えられる
トピックモデルは、文書から「何らかの話題(＝トピック)」を発見するための統計的なモデルのこと

トピックモデルのいろいろ

Unigram Mixtures
- ナイーブベイズでクラス数kと各パラメータをEMで繰り返し推定していく
- http://www.kamalnigam.com/papers/emcat-mlj99.pdf
Probabilistic Latent Semantic Indexing(PLSI)
- 検索技術であった潜在意味解析(LSI,1990年)を確率的に解析、開発された生成モデル(1999年)
- 各単語ごとに別なトピックから生成されたと仮定する
- http://cs.brown.edu/~th/papers/Hofmann-UAI99.pdf
Latent Dirichlet allocation(LDA)
- 階層ベイズモデルにしたもの。学習方法や応用・拡張の研究が盛ん
- http://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf
など

http://www.ai-gakkai.or.jp/my-bookmark_vol27-no3/
上記ページの下の方でもいろいろ紹介されている

LDAを試してみる

トピックモデルの基本となるLDAについて、以下のチュートリアルを参考にちょっと試してみた。
http://www.phontron.com/slides/nlp-programming-ja-07-topic.pdf

#include <iostream>
#include <vector>
#include <set>
#include <map>
#include <string>
#include <sstream>
#include <fstream>
#include <cmath>

//乱数
// 注意: longではなくint(32bit)にすべき
unsigned long xor128(){
  static unsigned long x=123456789,y=362436069,z=521288629,w=88675123;
  unsigned long t;
  t=(x^(x<<11));x=y;y=z;z=w; return( w=(w^(w>>19))^(t^(t>>8)) );
}
//[0,1)の一様乱数
// 注意: int_maxぐらいで割るべき
double frand(){
  return xor128()%10000000/static_cast<double>(10000000);
}

class LDA {
  int N_x, N_topic;
  double alpha, beta;

  std::vector< std::vector<std::string> > xcorpus;
  std::vector< std::vector<int> > ycorpus;
  std::map<std::pair<std::string,int>, int> xcounts;
  std::map< std::pair<int,int>, int> ycounts;

  void add_counts(const std::string& word, int topic, int docid, int amount){
    xcounts[std::make_pair<std::string,int>("",topic)] += amount;
    xcounts[std::make_pair<std::string,int>(word,topic)] += amount;
    
    std::stringstream ss;
    ss << docid;
    std::string sdocid = ss.str();

    ycounts[std::make_pair<int,int>(-1,docid)] += amount;
    ycounts[std::make_pair<int,int>(topic,docid)] += amount;
  }

  int sample_one(const std::vector<double>& probs){
    double z = 0.0;
    for(size_t i=0; i<probs.size(); i++){
      z += probs[i];
    }
    double remain = z * frand();
    for(size_t i=0; i<probs.size(); i++){
      remain -= probs[i];
      if(remain < 0.0) return i;
    }
    return probs.size()-1;
  }
  
  double P_x_y(const std::string x, int y){
    return (xcounts[std::make_pair<std::string,int>(x,y)] + alpha) /
      (xcounts[std::make_pair<std::string,int>("",y)] + alpha * N_x);
  }
  
  double P_y_Y(int y, int Y){
    return (ycounts[std::make_pair<int,int>(y,Y)] + beta) /
      (ycounts[std::make_pair<int,int>(-1,Y)] + beta * N_topic);
  }
  

public:
  LDA(int n_topic, double alpha_, double beta_, const std::string& file){
    N_topic = n_topic;
    alpha = alpha_;
    beta = beta_;

    std::ifstream ifs(file.c_str());
    std::string text, w;
    int docid = 0;
    std::set<std::string> w_uniq;

    while(std::getline(ifs, text)){
      std::stringstream ss(text);
      std::vector<std::string> ws;
      std::vector<int> ts;
      while(ss >> w){
        ws.push_back(w);
        w_uniq.insert(w);

        int t = xor128()%N_topic;
        ts.push_back(t);
        add_counts(w, t, docid, 1);
      }
      xcorpus.push_back(ws);
      ycorpus.push_back(ts);
      docid++;
    }

    N_x = w_uniq.size();
  }

  
  void dump(){
    for(size_t i=0; i<xcorpus.size(); i++){
      for(size_t j=0; j<xcorpus[i].size(); j++){
        std::cout << xcorpus[i][j] << "(" << ycorpus[i][j] << ") ";
      }
      std::cout << std::endl;
    }
  }

  
  void learn(int iter){
    for(int itr=0; itr<iter; itr++){
      double logs = 0.0;
      
      for(size_t i=0; i<xcorpus.size(); i++){
        for(size_t j=0; j<xcorpus[i].size(); j++){
          std::string x = xcorpus[i][j];
          int y = ycorpus[i][j];
          add_counts(x, y, i, -1);

          std::vector<double> probs;
          for(int k=0; k<N_topic; k++){
            probs.push_back( P_x_y(x, k) * P_y_Y(k, i) );
          }

          int new_y = sample_one(probs);
          logs += log(probs[new_y]);
          add_counts(x, new_y, i, 1);
          ycorpus[i][j] = new_y;

        }
      }

      std::cerr << logs << std::endl;
    }
  }

  

};


int main(){

  LDA lda(2, 0.001, 0.1, "07-train.txt");

  lda.learn(100);

  lda.dump();

  return 0;
}

結果

a(0) b(0) c(0) d(0) 
e(1) f(1) g(1) h(1) 
a(0) b(0) e(1) f(1)

期待される結果になってる。