2009年4月16日星期四

TF-IDF

Recently, I started working on WSDL document classification. I have read many articles that weight every word with the TF-IDF metric. In this way, each document is mapped onto a vector, and the whole document set is encoded as a matrix whose rows represent documents and whose columns are the weighted words.

To compute the frequency matrix, the program takes a collection of 6 WSDL documents and extracts the keywords from each document into a text file named "wsdl.txt", which is shown below:

Book;book server amozon price book
Zip;code weather country city zip zone
Fax;fax message
SMS;sms email message
Zip;zip code city area valid zip
Weather;weather zip city valid

This post mainly implements the TF-IDF matrix, so I only split the words in the simplest way and do not remove stopwords, etc.

Calculate the word weights and build the raw TF, TF and TF-IDF matrices: Matrix.java

package beaver;

import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
/**
 * Builds the term-weight matrices for a small document collection.
 *
 * <p>Every matrix is a map from document id to a row, and every row is a map
 * from word to weight. Rows are keyed by the words they were built from, in
 * sorted order.
 */
public class Matrix {

    /** Raw term counts per document: document id -> (word -> count). */
    static TreeMap<String, TreeMap<String, Double>> wordNumber;
    /** Sorted vocabulary; the raw and TF rows get one column per word in this set. */
    static SortedSet<String> wordBank;
    /** Document titles handed over by the caller; stored but not read by this class. */
    static String[] documnet;

    /**
     * Stores the data the matrix builders work on.
     *
     * @param documnet   document titles
     * @param wordNumber raw term counts per document id
     * @param wordBank   sorted set of all words in the collection
     */
    public void init(String[] documnet,
                     TreeMap<String, TreeMap<String, Double>> wordNumber,
                     SortedSet<String> wordBank) {
        // The fields are static, so assign through the class rather than "this".
        Matrix.documnet = documnet;
        Matrix.wordNumber = wordNumber;
        Matrix.wordBank = wordBank;
    }

    /**
     * Builds the raw term-frequency matrix.
     *
     * @return for every document, the count of every vocabulary word
     *         (0.0 when the word does not occur in that document)
     */
    public static TreeMap<String, TreeMap<String, Double>> rawMatrixDevelope() {
        TreeMap<String, TreeMap<String, Double>> matrix =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : wordNumber.entrySet()) {
            Map<String, Double> counts = doc.getValue();
            TreeMap<String, Double> row = new TreeMap<String, Double>();
            for (String word : wordBank) {
                if (counts.containsKey(word)) {
                    row.put(word, counts.get(word));
                } else {
                    row.put(word, 0.0);
                }
            }
            matrix.put(doc.getKey(), row);
        }
        return matrix;
    }

    /**
     * Builds the normalised term-frequency matrix.
     *
     * @return for every document, each word's count divided by the total
     *         number of word occurrences in that document (0.0 for absent words)
     */
    public static TreeMap<String, TreeMap<String, Double>> tfMatrixDevelop() {
        TreeMap<String, TreeMap<String, Double>> matrix =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : wordNumber.entrySet()) {
            Map<String, Double> counts = doc.getValue();

            // Total number of word occurrences in this document.
            double total = 0.0;
            for (Double count : counts.values()) {
                total += count;
            }

            TreeMap<String, Double> row = new TreeMap<String, Double>();
            for (String word : wordBank) {
                if (counts.containsKey(word)) {
                    row.put(word, counts.get(word) / total);
                } else {
                    row.put(word, 0.0);
                }
            }
            matrix.put(doc.getKey(), row);
        }
        return matrix;
    }

    /**
     * Builds the TF-IDF matrix: {@code tf * (1 + ln(N) - ln(df))}, where
     * {@code df} is the number of documents that contain the word.
     *
     * <p>The rows of {@code tfMatrix} are not modified; the result is made of
     * freshly created rows, so the caller's TF matrix stays usable afterwards.
     *
     * @param tfMatrix       normalised term frequencies per document id
     * @param documnetNumber total number of documents (N)
     * @return the weighted matrix, with the same document ids and words as {@code tfMatrix}
     */
    public TreeMap<String, TreeMap<String, Double>> idfMatrixDevelop(
            TreeMap<String, TreeMap<String, Double>> tfMatrix,
            double documnetNumber) {

        // Document frequency: in how many documents does each word occur?
        Map<String, Double> docFrequency = new TreeMap<String, Double>();
        for (Map<String, Double> counts : wordNumber.values()) {
            for (String word : counts.keySet()) {
                Double seen = docFrequency.get(word);
                docFrequency.put(word, seen == null ? 1.0 : seen + 1.0);
            }
        }

        TreeMap<String, TreeMap<String, Double>> weighted =
                new TreeMap<String, TreeMap<String, Double>>();
        for (Map.Entry<String, TreeMap<String, Double>> doc : tfMatrix.entrySet()) {
            TreeMap<String, Double> row = new TreeMap<String, Double>();
            for (Map.Entry<String, Double> cell : doc.getValue().entrySet()) {
                double tf = cell.getValue();
                Double df = docFrequency.get(cell.getKey());
                double weight;
                if (tf == 0.0 || df == null) {
                    // Absent word, or a word no document contains: avoids 0 * infinity = NaN.
                    weight = 0.0;
                } else {
                    weight = tf * (1 + Math.log(documnetNumber) - Math.log(df));
                }
                row.put(cell.getKey(), weight);
            }
            weighted.put(doc.getKey(), row);
        }
        return weighted;
    }
}

Output matrix: OutputMatrix.java


package beaver;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import beaver.Matrix;
import beaver.OutputMatrix;


public class Main {

public static TreeMap> wordNumber=new TreeMap>();
private static SortedSet wordBank=new TreeSet();//save all unique words
static String[] documnet=new String[500];//save all document name
static TreeMap> rawMatrix;
static TreeMap> tfMatrix;
static TreeMap> idfMatrix;
private static double documnetNumber=0.0D;

public static void main(String[] args) throws IOException {
int index=0;
BufferedReader reader = new BufferedReader(new FileReader("E:/wsdl.txt"));
String line = null;
while ((line = reader.readLine()) != null) {
index++;
documnetNumber++;
String[] docTitleParts = line.split(";");
documnet[index]=docTitleParts[0];
String sNickName="D"+index; //document name maybe the same,sNickName is the only marked
TreeMap wordCount = new TreeMap();
for(String c:docTitleParts[1].split(" ")){
wordBank.add(c);
if(wordCount.containsKey(c)){
wordCount.put(c,wordCount.get(c)+1.0); // Calculate the number of each word every document
}
else{
wordCount.put(c,1.0);
}
}
wordNumber.put(sNickName, wordCount);
}
out();
}

private static void out() {

Matrix matrix=new Matrix();
matrix.init(documnet,wordNumber, wordBank);

System.out.println("=== Raw Term Frequencies ===");
rawMatrix=matrix.rawMatrixDevelope();
OutputMatrix.outputMatrix(documnet,rawMatrix,wordBank);

System.out.println("=== Term Frequency ===");
tfMatrix=matrix.tfMatrixDevelop();
OutputMatrix.outputMatrix(documnet,tfMatrix,wordBank);

System.out.println("=== Inverse Document Frequency ===");
idfMatrix=matrix.idfMatrixDevelop(tfMatrix,documnetNumber);
OutputMatrix.outputMatrix(documnet,idfMatrix,wordBank);
}
}

Main.java

package beaver;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import beaver.Matrix;
import beaver.OutputMatrix;


public class Main {

public static TreeMap> wordNumber=new TreeMap>();
private static SortedSet wordBank=new TreeSet();//save all unique words
static String[] documnet=new String[500];//save all document name
static TreeMap> rawMatrix;
static TreeMap> tfMatrix;
static TreeMap> idfMatrix;
private static double documnetNumber=0.0D;

public static void main(String[] args) throws IOException {
int index=0;
BufferedReader reader = new BufferedReader(new FileReader("E:/wsdl.txt"));
String line = null;
while ((line = reader.readLine()) != null) {
index++;
documnetNumber++;
String[] docTitleParts = line.split(";");
documnet[index]=docTitleParts[0];
String sNickName="D"+index; //document name maybe the same,sNickName is the only marked
TreeMap wordCount = new TreeMap();
for(String c:docTitleParts[1].split(" ")){
wordBank.add(c);
if(wordCount.containsKey(c)){
wordCount.put(c,wordCount.get(c)+1.0); // Calculat the number of each word every document
}
else{
wordCount.put(c,1.0);
}
}
wordNumber.put(sNickName, wordCount);
}
out();
}

private static void out() {

Matrix matrix=new Matrix();
matrix.init(documnet,wordNumber, wordBank);

System.out.println("=== Raw Term Frequencies ===");
rawMatrix=matrix.rawMatrixDevelope();
OutputMatrix.outputMatrix(documnet,rawMatrix,wordBank);

System.out.println("=== Term Frequency ===");
tfMatrix=matrix.tfMatrixDevelop();
OutputMatrix.outputMatrix(documnet,tfMatrix,wordBank);

System.out.println("=== Inverse Document Frequency ===");
idfMatrix=matrix.idfMatrixDevelop(tfMatrix,documnetNumber);
OutputMatrix.outputMatrix(documnet,idfMatrix,wordBank);
}
}

The output is as follows:

=== Raw Term Frequencies ===
amozon area book city code country email fax message price sms weather zip
Book 1.0000 0.0000 2.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000

Zip 0.0000 0.0000 0.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 0.0000 1.0000 0.0000 1.0000 0.0000 0.0000
Zip 0.0000 1.0000 0.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000
Weather 0.0000 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 1.0000

=== Term Frequency ===

amozon area book city code country email fax message price sms weather zip
Book 0.2500 0.0000 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500 0.0000 0.0000 0.0000
Zip 0.0000 0.0000 0.0000 0.2500 0.2500 0.2500 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.5000 0.5000 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.3333 0.0000 0.3333 0.0000 0.3333 0.0000 0.0000
Zip 0.0000 0.2500 0.0000 0.2500 0.2500 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.2500
Weather 0.0000 0.0000 0.0000 0.3333 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.3333 0.3333


=== Inverse Document Frequency ===
amozon area book city code country email fax message price sms weather zip
Book 0.6979 0.0000 1.3959 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.6979 0.0000 0.0000 0.0000

Zip 0.0000 0.0000 0.0000 0.4233 0.5247 0.6979 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.4233
Fax 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.3959 1.0493 0.0000 0.0000 0.0000 0.0000
SMS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.9306 0.0000 0.6995 0.0000 0.9306 0.0000 0.0000
Zip 0.0000 0.6979 0.0000 0.4233 0.5247 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.4233
Weather 0.0000 0.0000 0.0000 0.5644 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.9306 0.5644




2009年4月12日星期日

Parse XML file

Recently I wrote an XML file that contains database configuration information such as driverName, dbURL, userName and userPwd. The program parses it to obtain the database connection information, so you can adapt to a different database simply by editing the XML file, without changing your program.

I use a MySQL database.

database-config.xml

Use the JDOM API to parse database-config.xml and get driverName, dbURL, userName and userPwd.

configReader.java


package beaver;

import java.io.IOException;

import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;

/**
 * Reads the database connection settings (driver name, URL, user name and
 * password) from an XML configuration file using JDOM.
 */
public class configReader {
    private String driverName;
    private String dbURL;
    private String userName;
    private String userPwd;

    /**
     * Parses the given XML file and returns its root element.
     *
     * @param fileName location of the XML file, passed to {@code SAXBuilder.build}
     * @return the root element of the parsed document
     * @throws IllegalStateException if the file cannot be read or is not valid XML;
     *         the underlying exception is kept as the cause
     */
    public Element getRoot(String fileName) {
        // Create the parser object.
        SAXBuilder sax = new SAXBuilder();
        try {
            // Parse the XML document into a node tree and return its root.
            Document doc = sax.build(fileName);
            return doc.getRootElement();
        } catch (JDOMException e) {
            // Fail here with the real cause instead of a later NullPointerException.
            throw new IllegalStateException("Cannot parse XML file: " + fileName, e);
        } catch (IOException e) {
            throw new IllegalStateException("Cannot read XML file: " + fileName, e);
        }
    }

    /**
     * Reads the four connection settings from the {@code config-info} child of
     * {@code root} and stores them in this object.
     *
     * @param root root element of the configuration document
     * @throws Exception if the {@code config-info} element is missing, or if
     *         any of the four settings is missing or empty
     */
    public void getDBInformation(Element root) throws Exception {
        Element configInfo = (root == null) ? null : root.getChild("config-info");
        if (configInfo == null) {
            throw new Exception("Database config-info element is missing.");
        }
        driverName = childText(configInfo, "driverName");
        dbURL = childText(configInfo, "dbURL");
        userName = childText(configInfo, "userName");
        userPwd = childText(configInfo, "userPwd");

        // Compare by content: "==" on strings only compares object identity.
        requireSet(driverName, "Database Driver Name is not set.");
        requireSet(dbURL, "Database URL String is not set.");
        requireSet(userName, "Database User Name is not set.");
        requireSet(userPwd, "Database Password is not set.");
    }

    /** Returns the text of the named child, or the empty string when the child is absent. */
    private static String childText(Element parent, String childName) {
        Element child = parent.getChild(childName);
        return child == null ? "" : child.getText();
    }

    /** Throws an exception carrying {@code message} when {@code value} is null or empty. */
    private static void requireSet(String value, String message) throws Exception {
        if (value == null || value.length() == 0) {
            throw new Exception(message);
        }
    }

    public static void main(String[] args) throws Exception {

        configReader parser = new configReader();
        parser.getDBInformation(parser.getRoot("E:/database-config.xml"));

    }

    /** @return the JDBC driver class name read from the configuration */
    public String getDriverName() {
        return driverName;
    }

    /** @return the database URL read from the configuration */
    public String getDbURL() {
        return dbURL;
    }

    /** @return the database user name read from the configuration */
    public String getUserName() {
        return userName;
    }

    /** @return the database password read from the configuration */
    public String getUserPwd() {
        return userPwd;
    }
}

2009年3月29日星期日

JDBC

JDBC——Implementing a Database Connection
Interface Database

//source:
package beaver;

import java.sql.Connection;
import java.sql.SQLException;

/**
 * Access point for JDBC connections plus the settings they were opened with.
 */
public interface Database {

    /**
     * Gives access to the shared connection; every call yields the same one.
     *
     * @return the shared database connection
     */
    Connection getConnection();

    /**
     * Opens an independent connection; every call yields a new one.
     *
     * @return a new database connection
     * @throws SQLException declared for implementations that cannot open a connection
     */
    Connection getNewConnection() throws SQLException;

    /** @return the database name */
    String getDatabaseName();

    /** @return the driver name */
    String getDriverName();

    /** @return the password */
    String getPassword();

    /** @return the user name */
    String getUserName();

}

GenericDatabase .java

//source
package beaver;

import java.sql.*;

/**
 * {@link Database} implementation that loads a JDBC driver, opens one shared
 * connection at construction time and can open further connections on demand.
 */
public class GenericDatabase implements Database {

    private final String driverName;
    private final String databaseName;
    private final String userName;
    private final String password;
    private final Connection con;

    /**
     * Loads the driver and opens the shared connection.
     *
     * <p>Failures are propagated to the caller, as the signature declares,
     * instead of being printed and leaving the shared connection {@code null}.
     *
     * @param _driverName   fully qualified JDBC driver class name
     * @param _databaseName JDBC URL passed to {@code DriverManager.getConnection}
     * @param _userName     database user name
     * @param _password     database password
     * @throws ClassNotFoundException if the driver class cannot be loaded
     * @throws SQLException if the connection cannot be opened
     */
    public GenericDatabase(
            String _driverName,
            String _databaseName,
            String _userName,
            String _password)
            throws ClassNotFoundException, SQLException {
        driverName = _driverName;
        databaseName = _databaseName;
        userName = _userName;
        password = _password;
        Class.forName(driverName);
        con = DriverManager.getConnection(databaseName, userName, password);
    }

    /** @return the JDBC URL this object was created with */
    public String getDatabaseName() {
        return databaseName;
    }

    /** @return the driver class name this object was created with */
    public String getDriverName() {
        return driverName;
    }

    /** @return the password this object was created with */
    public String getPassword() {
        return password;
    }

    /** @return the user name this object was created with */
    public String getUserName() {
        return userName;
    }

    /**
     * Returns a database connection.
     * The database connection that is returned is always the same.
     *
     * @return the connection opened by the constructor
     */
    public Connection getConnection() {
        return con;
    }

    /**
     * Returns an independent connection.
     * Returns a new database connection for each call.
     * This method retries every half second until it actually gets a connection.
     *
     * <p>If the thread is interrupted while waiting, the retry loop continues,
     * but the interrupt status is restored before returning so the caller can
     * still observe it.
     *
     * @return a new database connection
     */
    public Connection getNewConnection() {
        boolean interrupted = false;
        try {
            while (true) {
                try {
                    return DriverManager.getConnection(
                            getDatabaseName(),
                            getUserName(),
                            getPassword());
                } catch (SQLException e) {
                    System.err.println(e.getErrorCode() + " " + e.getMessage());
                    try {
                        // Wait for half a second, then try again.
                        Thread.sleep(500);
                    } catch (InterruptedException ie) {
                        // Remember the interrupt; it is re-asserted in the finally block.
                        interrupted = true;
                    }
                }
            }
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }
}

myDatabase .java
Connects to the database. To connect to a local database, the ip is "localhost" or "127.0.0.1".
"first" is the database name; "root" is the username; "11111" is the password.

//source:
package beaver;

import java.sql.*;

/**
 * {@code GenericDatabase} preconfigured with the MySQL driver and the
 * connection settings of the "first" database.
 */
public class myDatabase extends GenericDatabase {

    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://ip:3306/first";
    private static final String USER = "root";
    private static final String PASSWORD = "11111";

    /**
     * Passes the fixed connection settings on to the superclass constructor.
     *
     * @throws ClassNotFoundException declared by the superclass constructor
     * @throws SQLException declared by the superclass constructor
     */
    public myDatabase() throws ClassNotFoundException, SQLException {
        super(DRIVER, URL, USER, PASSWORD);
    }
}

JavaSyntax

Recently I used some Java syntax, as follows:

1)Formatter

import java.util.Formatter;

Formatter f=new Formatter(System.out);
// Print the string s in a 40-character field; the '-' flag makes it flush left.
f.format("%-40s",s);
// Print the double d with 4 decimal places in a 40-character field. The '-' flag
// makes it flush left as well; without that flag the default is flush right.
f.format("%-40.4f", d);

2)Create a .arff file and write data to it

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

File f = new File("d:/" + "weather" + ".arff");
DataOutputStream out=
new DataOutputStream(
new FileOutputStream(f));
try {
    out.writeBytes("@relation weather");
    // Remove the blanks inside the string before writing it.
    String s="hello world";
    out.writeBytes(s.replaceAll(" ", ""));
} finally {
    // Close the stream even if a write fails, so the file handle is released.
    out.close();
}

3)
Keep 4 decimal places

import java.text.DecimalFormat;

// The pattern "0.0000" formats a number with exactly four digits after the decimal point.
final DecimalFormat fourPlaces = new DecimalFormat("0.0000");
out.writeBytes(fourPlaces.format(d));

2009年3月14日星期六

first

Some days ago I came across sujitpal's blog. I really like it and admire him; his articles are useful to me, so I wanted to create a blog of my own to write in. I am a postgraduate now, and in 2 years I will graduate from school. I have always held one belief: be myself, rely on myself, and realize my dreams. I deeply understand that if I want to succeed I must do my best on my own, because only I can help myself. I want to be a programmer, but I am poor at programming. So I am learning the necessary knowledge and technology, but sometimes I feel helpless and even lose my direction, and there is nobody I can talk to. Maybe I know too little and need to learn faster. I think I must put in more and more effort if I want to realize my dreams. The most important thing is persistence. From now on, I will write something in my blog every day, and I believe I can become a useful person. Fighting!