To compute the frequency matrix, the program takes a collection of 6 WSDL documents and extracts keywords from each document into a text file named "wsdl.txt", which is shown below:
Book;book server amozon price book
Zip;code weather country city zip zone
Fax;fax message
SMS;sms email message
Zip;zip code city area valid zip
Weather;weather zip city valid
This post mainly implements the TF-IDF matrix, so I only split the words in the simplest way and do not remove stop words, etc.
Calculate the word weights and build the raw TF, TF, and IDF matrices: Matrix.java
package beaver;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
/**
 * Builds three document-by-term matrices from per-document word counts:
 * raw term counts, normalised term frequency (TF), and TF weighted by
 * inverse document frequency.
 *
 * <p>Each matrix is a map from document id to a row, and each row is a map
 * from every word of the vocabulary to its value. The inputs are stored in
 * static fields by {@link #init}, so only one data set can be active at a time.
 */
public class Matrix {
    /** Word counts per document: document id -> (word -> occurrences). */
    static TreeMap<String, TreeMap<String, Double>> wordNumber;
    /** Sorted vocabulary; defines the columns of every matrix. */
    static SortedSet<String> wordBank;
    /** Document titles as handed to {@link #init}; not read by this class. */
    static String[] documnet;

    /**
     * Stores the data set that the matrix builders work on.
     *
     * @param documnet   document titles
     * @param wordNumber word counts per document id
     * @param wordBank   vocabulary (all distinct words)
     */
    public void init(String[] documnet,
                     TreeMap<String, TreeMap<String, Double>> wordNumber,
                     SortedSet<String> wordBank) {
        // The fields are static, so assign through the class rather than "this".
        Matrix.documnet = documnet;
        Matrix.wordNumber = wordNumber;
        Matrix.wordBank = wordBank;
    }

    /**
     * Builds the raw term-count matrix.
     *
     * @return document id -> (word -> occurrence count, 0.0 if the word is absent)
     */
    public static TreeMap<String, TreeMap<String, Double>> rawMatrixDevelope() {
        TreeMap<String, TreeMap<String, Double>> matrix1 =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : wordNumber.entrySet()) {
            Map<String, Double> counts = doc.getValue();
            TreeMap<String, Double> row = new TreeMap<String, Double>();
            for (String word : wordBank) {
                Double count = counts.get(word);
                row.put(word, count == null ? 0.0 : count);
            }
            matrix1.put(doc.getKey(), row);
        }
        return matrix1;
    }

    /**
     * Builds the term-frequency matrix: each count divided by the total
     * number of word occurrences in the same document.
     *
     * @return document id -> (word -> relative frequency, 0.0 if the word is absent)
     */
    public static TreeMap<String, TreeMap<String, Double>> tfMatrixDevelop() {
        TreeMap<String, TreeMap<String, Double>> tfMatrix1 =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : wordNumber.entrySet()) {
            Map<String, Double> counts = doc.getValue();

            double total = 0.0;
            for (Double count : counts.values()) {
                total += count;
            }

            TreeMap<String, Double> row = new TreeMap<String, Double>();
            for (String word : wordBank) {
                Double count = counts.get(word);
                // A word that is present implies total > 0, so no division by zero.
                row.put(word, count == null ? 0.0 : count / total);
            }
            tfMatrix1.put(doc.getKey(), row);
        }
        return tfMatrix1;
    }

    /**
     * Weights a TF matrix by inverse document frequency:
     * {@code tf * (1 + ln(N) - ln(df))}, where {@code df} is the number of
     * documents in the stored word counts that contain the word.
     *
     * <p>The input matrix is left untouched; a new matrix with new rows is
     * returned. A word that occurs in no document gets weight 0.0.
     *
     * @param tfMatrix       term-frequency matrix to weight
     * @param documnetNumber total number of documents (N)
     * @return document id -> (word -> weighted value)
     */
    public TreeMap<String, TreeMap<String, Double>> idfMatrixDevelop(
            TreeMap<String, TreeMap<String, Double>> tfMatrix,
            double documnetNumber) {
        // Document frequency of every word, starting from 0 for the whole vocabulary.
        TreeMap<String, Double> documentFrequency = new TreeMap<String, Double>();
        for (String word : wordBank) {
            documentFrequency.put(word, 0.0);
        }
        for (Map<String, Double> counts : wordNumber.values()) {
            for (String word : counts.keySet()) {
                Double seen = documentFrequency.get(word);
                documentFrequency.put(word, seen == null ? 1.0 : seen + 1.0);
            }
        }

        double logDocuments = Math.log(documnetNumber);
        TreeMap<String, TreeMap<String, Double>> idfMatrix1 =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : tfMatrix.entrySet()) {
            // Build a fresh row so the caller's TF matrix is not overwritten.
            TreeMap<String, Double> weighted = new TreeMap<String, Double>();
            for (Map.Entry<String, Double> cell : doc.getValue().entrySet()) {
                Double df = documentFrequency.get(cell.getKey());
                if (df == null || df <= 0.0) {
                    // ln(0) would turn the product into NaN or infinity.
                    weighted.put(cell.getKey(), 0.0);
                } else {
                    weighted.put(cell.getKey(),
                            cell.getValue() * (1 + logDocuments - Math.log(df)));
                }
            }
            idfMatrix1.put(doc.getKey(), weighted);
        }
        return idfMatrix1;
    }
}
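Output matrix: OutputMatrix.java (note: the listing that follows declares class Main and duplicates Main.java; the OutputMatrix source itself is not shown in this post)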
package beaver;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import beaver.Matrix;
import beaver.OutputMatrix;
public class Main {
public static TreeMap
private static SortedSet
static String[] documnet=new String[500];//save all document name
static TreeMap
static TreeMap
static TreeMap
private static double documnetNumber=0.0D;
public static void main(String[] args) throws IOException {
int index=0;
BufferedReader reader = new BufferedReader(new FileReader("E:/wsdl.txt"));
String line = null;
while ((line = reader.readLine()) != null) {
index++;
documnetNumber++;
String[] docTitleParts = line.split(";");
documnet[index]=docTitleParts[0];
String sNickName="D"+index; //document name maybe the same,sNickName is the only marked
TreeMap
for(String c:docTitleParts[1].split(" ")){
wordBank.add(c);
if(wordCount.containsKey(c)){
wordCount.put(c,wordCount.get(c)+1.0); // Calculate the number of each word every document
}
else{
wordCount.put(c,1.0);
}
}
wordNumber.put(sNickName, wordCount);
}
out();
}
private static void out() {
Matrix matrix=new Matrix();
matrix.init(documnet,wordNumber, wordBank);
System.out.println("=== Raw Term Frequencies ===");
rawMatrix=matrix.rawMatrixDevelope();
OutputMatrix.outputMatrix(documnet,rawMatrix,wordBank);
System.out.println("=== Term Frequency ===");
tfMatrix=matrix.tfMatrixDevelop();
OutputMatrix.outputMatrix(documnet,tfMatrix,wordBank);
System.out.println("=== Inverse Document Frequency ===");
idfMatrix=matrix.idfMatrixDevelop(tfMatrix,documnetNumber);
OutputMatrix.outputMatrix(documnet,idfMatrix,wordBank);
}
}
Main.java
package beaver;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import beaver.Matrix;
import beaver.OutputMatrix;
public class Main {
public static TreeMap
private static SortedSet
static String[] documnet=new String[500];//save all document name
static TreeMap
static TreeMap
static TreeMap
private static double documnetNumber=0.0D;
public static void main(String[] args) throws IOException {
int index=0;
BufferedReader reader = new BufferedReader(new FileReader("E:/wsdl.txt"));
String line = null;
while ((line = reader.readLine()) != null) {
index++;
documnetNumber++;
String[] docTitleParts = line.split(";");
documnet[index]=docTitleParts[0];
String sNickName="D"+index; //document name maybe the same,sNickName is the only marked
TreeMap
for(String c:docTitleParts[1].split(" ")){
wordBank.add(c);
if(wordCount.containsKey(c)){
wordCount.put(c,wordCount.get(c)+1.0); // Calculat the number of each word every document
}
else{
wordCount.put(c,1.0);
}
}
wordNumber.put(sNickName, wordCount);
}
out();
}
private static void out() {
Matrix matrix=new Matrix();
matrix.init(documnet,wordNumber, wordBank);
System.out.println("=== Raw Term Frequencies ===");
rawMatrix=matrix.rawMatrixDevelope();
OutputMatrix.outputMatrix(documnet,rawMatrix,wordBank);
System.out.println("=== Term Frequency ===");
tfMatrix=matrix.tfMatrixDevelop();
OutputMatrix.outputMatrix(documnet,tfMatrix,wordBank);
System.out.println("=== Inverse Document Frequency ===");
idfMatrix=matrix.idfMatrixDevelop(tfMatrix,documnetNumber);
OutputMatrix.outputMatrix(documnet,idfMatrix,wordBank);
}
}
The output is shown below:
=== Raw Term Frequencies ===
amozon area book city code country email fax message price sms weather zip
Book 1.0000 0.0000 2.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000
Zip 0.0000 0.0000 0.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0000 1.0000 0.0000 1.0000 0.0000 0.0000
Zip 0.0000 1.0000 0.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000
Weather 0.0000 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 1.0000
=== Term Frequency ===
amozon area book city code country email fax message price sms weather zip
Book 0.2500 0.0000 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500 0.0000 0.0000 0.0000
Zip 0.0000 0.0000 0.0000 0.2500 0.2500 0.2500 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.5000 0.5000 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.3333 0.0000 0.3333 0.0000 0.3333 0.0000 0.0000
Zip 0.0000 0.2500 0.0000 0.2500 0.2500 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500
Weather 0.0000 0.0000 0.0000 0.3333 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.3333 0.3333
=== Inverse Document Frequency ===
amozon area book city code country email fax message price sms weather zip
Book 0.6979 0.0000 1.3959 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.6979 0.0000 0.0000 0.0000
Zip 0.0000 0.0000 0.0000 0.4233 0.5247 0.6979 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.4233
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.3959 1.0493 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.9306 0.0000 0.6995 0.0000 0.9306 0.0000 0.0000
Zip 0.0000 0.6979 0.0000 0.4233 0.5247 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.4233
Weather 0.0000 0.0000 0.0000 0.5644 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.9306 0.5644

没有评论:
发表评论