Be myself，by myself，realize my dreams!: TF-IDF

Recently, I started working on WSDL document classification. I have read a lot of articles which mention that all words are weighted with the TF-IDF metric. In this way, each document is mapped onto a vector. The whole document set is encoded in a matrix, where rows represent documents and columns are the weighted words.

To compute the frequency matrix, the program takes a collection of 6 WSDL documents and extracts keywords from each document, writing them to a text file named "wsdl.txt". Each line has the form `name;keyword keyword ...`, as shown below:

Book;book server amozon price book
Zip;code weather country city zip zone
Fax;fax message
SMS;sms email message
Zip;zip code city area valid zip
Weather;weather zip city valid

This post mainly implements the TF-IDF matrix, so I only split words naively on spaces and do not consider removing stopwords, etc.

Calculate word weights and build the raw TF, TF, and IDF matrices: Matrix.java

package beaver;

import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
/**
 * Builds term-weight matrices (raw term frequency, normalized term frequency
 * and TF-IDF) from per-document word counts.
 *
 * <p>All input state is held in static fields that are populated through
 * {@link #init}. Every matrix is a map from document id to a row, and every
 * row maps each word of the word bank to its weight.
 */
public class Matrix {

    /** Raw word counts per document: document id -> (word -> count). */
    static TreeMap<String, TreeMap<String, Double>> wordNumber;
    /** Sorted set of words that form the matrix columns. */
    static SortedSet<String> wordBank;
    /** Document names as supplied by the caller; only stored here, never read by this class. */
    static String[] documnet;

    /**
     * Stores the inputs used by the matrix builders.
     *
     * @param documnet   document names
     * @param wordNumber raw word counts per document id
     * @param wordBank   the words that become matrix columns
     */
    public void init(String[] documnet,
            TreeMap<String, TreeMap<String, Double>> wordNumber,
            SortedSet<String> wordBank) {
        // The fields are static, so assign through the class rather than "this".
        Matrix.documnet = documnet;
        Matrix.wordNumber = wordNumber;
        Matrix.wordBank = wordBank;
    }

    /**
     * Builds the raw term-frequency matrix.
     *
     * @return for every document id, a row holding the raw count of each
     *         word-bank word (0.0 when the document does not contain it)
     */
    public static TreeMap<String, TreeMap<String, Double>> rawMatrixDevelope() {
        TreeMap<String, TreeMap<String, Double>> matrix =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : wordNumber.entrySet()) {
            Map<String, Double> counts = doc.getValue();
            TreeMap<String, Double> row = new TreeMap<String, Double>();
            for (String word : wordBank) {
                Double count = counts.get(word);
                row.put(word, count != null ? count : 0.0);
            }
            matrix.put(doc.getKey(), row);
        }
        return matrix;
    }

    /**
     * Builds the normalized term-frequency matrix: each count divided by the
     * total number of word occurrences in the same document.
     *
     * @return for every document id, a row of normalized frequencies
     */
    public static TreeMap<String, TreeMap<String, Double>> tfMatrixDevelop() {
        TreeMap<String, TreeMap<String, Double>> matrix =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : wordNumber.entrySet()) {
            Map<String, Double> counts = doc.getValue();

            // Total occurrences in this document is the normalization divisor.
            double total = 0.0;
            for (double count : counts.values()) {
                total += count;
            }

            TreeMap<String, Double> row = new TreeMap<String, Double>();
            for (String word : wordBank) {
                Double count = counts.get(word);
                row.put(word, count != null ? count / total : 0.0);
            }
            matrix.put(doc.getKey(), row);
        }
        return matrix;
    }

    /**
     * Builds the TF-IDF matrix: {@code tf * (1 + ln(N) - ln(df))}, where
     * {@code N} is {@code documnetNumber} and {@code df} is the number of
     * documents containing the word.
     *
     * <p>The rows of {@code tfMatrix} are not modified; the result is made of
     * newly allocated rows.
     *
     * @param tfMatrix       normalized term frequencies, as returned by
     *                       {@link #tfMatrixDevelop()}
     * @param documnetNumber total number of documents
     * @return for every document id in {@code tfMatrix}, a row of TF-IDF weights
     */
    public TreeMap<String, TreeMap<String, Double>> idfMatrixDevelop(
            TreeMap<String, TreeMap<String, Double>> tfMatrix,
            double documnetNumber) {

        // Document frequency: in how many documents each word occurs.
        TreeMap<String, Double> docFrequency = new TreeMap<String, Double>();
        for (String word : wordBank) {
            docFrequency.put(word, 0.0);
        }
        for (Map<String, Double> counts : wordNumber.values()) {
            for (String word : counts.keySet()) {
                Double seen = docFrequency.get(word);
                docFrequency.put(word, (seen != null ? seen : 0.0) + 1.0);
            }
        }

        TreeMap<String, TreeMap<String, Double>> idfMatrix =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> tfRow : tfMatrix.entrySet()) {
            // Write into a fresh row so the caller's TF matrix stays intact.
            TreeMap<String, Double> weighted = new TreeMap<String, Double>();
            for (Map.Entry<String, Double> cell : tfRow.getValue().entrySet()) {
                double tf = cell.getValue();
                Double df = docFrequency.get(cell.getKey());
                double weight;
                if (tf == 0.0 || df == null || df == 0.0) {
                    // Avoids 0 * log(0) = NaN for words no document contains.
                    weight = 0.0;
                } else {
                    weight = tf * (1 + Math.log(documnetNumber) - Math.log(df));
                }
                weighted.put(cell.getKey(), weight);
            }
            idfMatrix.put(tfRow.getKey(), weighted);
        }
        return idfMatrix;
    }
}

Output matrix: OutputMatrix.java

package beaver;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import beaver.Matrix;
import beaver.OutputMatrix;

public class Main {

public static TreeMap> wordNumber=new TreeMap>();
private static SortedSet wordBank=new TreeSet();//save all unique words
static String[] documnet=new String[500];//save all document name
static TreeMap> rawMatrix;
static TreeMap> tfMatrix;
static TreeMap> idfMatrix;
private static double documnetNumber=0.0D;

public static void main(String[] args) throws IOException {
int index=0;
BufferedReader reader = new BufferedReader(new FileReader("E:/wsdl.txt"));
String line = null;
while ((line = reader.readLine()) != null) {
index++;
documnetNumber++;
String[] docTitleParts = line.split(";");
documnet[index]=docTitleParts[0];
String sNickName="D"+index; //document name maybe the same,sNickName is the only marked
TreeMap wordCount = new TreeMap();
for(String c:docTitleParts[1].split(" ")){
wordBank.add(c);
if(wordCount.containsKey(c)){
wordCount.put(c,wordCount.get(c)+1.0); // Calculate the number of each word every document
}
else{
wordCount.put(c,1.0);
}
}
wordNumber.put(sNickName, wordCount);
}
out();
}

private static void out() {

Matrix matrix=new Matrix();
matrix.init(documnet,wordNumber, wordBank);

System.out.println("=== Raw Term Frequencies ===");
rawMatrix=matrix.rawMatrixDevelope();
OutputMatrix.outputMatrix(documnet,rawMatrix,wordBank);

System.out.println("=== Term Frequency ===");
tfMatrix=matrix.tfMatrixDevelop();
OutputMatrix.outputMatrix(documnet,tfMatrix,wordBank);

System.out.println("=== Inverse Document Frequency ===");
idfMatrix=matrix.idfMatrixDevelop(tfMatrix,documnetNumber);
OutputMatrix.outputMatrix(documnet,idfMatrix,wordBank);
}
}

Main.java

package beaver;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import beaver.Matrix;
import beaver.OutputMatrix;

public class Main {

public static TreeMap> wordNumber=new TreeMap>();
private static SortedSet wordBank=new TreeSet();//save all unique words
static String[] documnet=new String[500];//save all document name
static TreeMap> rawMatrix;
static TreeMap> tfMatrix;
static TreeMap> idfMatrix;
private static double documnetNumber=0.0D;

public static void main(String[] args) throws IOException {
int index=0;
BufferedReader reader = new BufferedReader(new FileReader("E:/wsdl.txt"));
String line = null;
while ((line = reader.readLine()) != null) {
index++;
documnetNumber++;
String[] docTitleParts = line.split(";");
documnet[index]=docTitleParts[0];
String sNickName="D"+index; //document name maybe the same,sNickName is the only marked
TreeMap wordCount = new TreeMap();
for(String c:docTitleParts[1].split(" ")){
wordBank.add(c);
if(wordCount.containsKey(c)){
wordCount.put(c,wordCount.get(c)+1.0); // Calculat the number of each word every document
}
else{
wordCount.put(c,1.0);
}
}
wordNumber.put(sNickName, wordCount);
}
out();
}

private static void out() {

Matrix matrix=new Matrix();
matrix.init(documnet,wordNumber, wordBank);

System.out.println("=== Raw Term Frequencies ===");
rawMatrix=matrix.rawMatrixDevelope();
OutputMatrix.outputMatrix(documnet,rawMatrix,wordBank);

System.out.println("=== Term Frequency ===");
tfMatrix=matrix.tfMatrixDevelop();
OutputMatrix.outputMatrix(documnet,tfMatrix,wordBank);

System.out.println("=== Inverse Document Frequency ===");
idfMatrix=matrix.idfMatrixDevelop(tfMatrix,documnetNumber);
OutputMatrix.outputMatrix(documnet,idfMatrix,wordBank);
}
}

The output is as follows:

=== Raw Term Frequencies ===
amozon area book city code country email fax message price sms weather zip
Book 1.0000 0.0000 2.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000
Zip 0.0000 0.0000 0.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0000 1.0000 0.0000 1.0000 0.0000 0.0000
Zip 0.0000 1.0000 0.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000
Weather 0.0000 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 1.0000

=== Term Frequency ===
amozon area book city code country email fax message price sms weather zip
Book 0.2500 0.0000 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500 0.0000 0.0000 0.0000
Zip 0.0000 0.0000 0.0000 0.2500 0.2500 0.2500 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.5000 0.5000 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.3333 0.0000 0.3333 0.0000 0.3333 0.0000 0.0000
Zip 0.0000 0.2500 0.0000 0.2500 0.2500 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500
Weather 0.0000 0.0000 0.0000 0.3333 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.3333 0.3333

=== Inverse Document Frequency ===
amozon area book city code country email fax message price sms weather zip
Book 0.6979 0.0000 1.3959 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.6979 0.0000 0.0000 0.0000
Zip 0.0000 0.0000 0.0000 0.4233 0.5247 0.6979 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.4233
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.3959 1.0493 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.9306 0.0000 0.6995 0.0000 0.9306 0.0000 0.0000
Zip 0.0000 0.6979 0.0000 0.4233 0.5247 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.4233
Weather 0.0000 0.0000 0.0000 0.5644 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.9306 0.5644

Be myself，by myself，realize my dreams!

2009年4月16日星期四

TF-IDF

没有评论:

发表评论

Websites

博客归档

我的简介