测试lucene的所有分词接口

 

Lucene本身提供了几个分词接口,我后来又自己写了一个分词接口.

功能递增如下:

数据挖掘实验室

WhitespaceAnalyzer:仅仅是按空格切分,不对字符做lowercase化,不支持中文

SimpleAnalyzer:功能强于WhitespaceAnalyzer,将除去letter之外的符号全部过滤掉,并且将所有的字符lowercase化,不支持中文

数据挖掘工具

StopAnalyzer:StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基础上
    增加了去除StopWords的功能,不支持中文
数据挖掘研究院

StandardAnalyzer:英文的处理能力同于StopAnalyzer.支持中文,采用的方法为单字切分.

ChineseAnalyzer:来自于Lucene的sandbox.性能类似于StandardAnalyzer,缺点是不支持中英文混合分词.

数据挖掘实验室

CJKAnalyzer:chedong写的CJKAnalyzer的功能在英文处理上的功能和StandardAnalyzer相同
    但是在汉语的分词上,不能过滤掉标点符号,即使用二元切分
数据挖掘工具

TjuChineseAnalyzer:我写的,功能最为强大.TjuChineseAnalyzer的功能相当强大,在中文分词方面由于其调用的为ICTCLAS的java接口,所以其在中文方面性能上同于ICTCLAS.其在英文分词上采用了Lucene的StopAnalyzer,可以去除stopWords,而且可以不区分大小写,过滤掉各类标点符号.

数据挖掘交友

程序调试于:JBuilder 2005

package org.apache.lucene.analysis;

//Author:zhangbufeng
//TjuAILab(天津大学人工智能实验室)
//2005.9.22.11:00


import java.io.*;
import junit.framework.*;

import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.cn.*;
import org.apache.lucene.analysis.cjk.*;
import org.apache.lucene.analysis.tjucn.*;
import com.xjt.nlp.word.*;
public class TestAnalyzers extends TestCase {

   public TestAnalyzers(String name) {
      super(name);
   }

数据挖掘工具

  public void assertAnalyzesTo(Analyzer a,
                               String input,
                               String[] output) throws Exception {
    //前面的"dummy"好像没有用到
    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    StringReader readerInput=new StringReader(input);
    for (int i=0; i      Token t = ts.next();
      //System.out.println(t);
      assertNotNull(t);
      //使用下面这条语句即可以输出Token的每项的text,并且用空格分开
      System.out.print(t.termText);

数据挖掘工具


      System.out.print(" ");
      assertEquals(t.termText(), output[i]);
    }
    System.out.println(" ");
    assertNull(ts.next());
    ts.close();
  }
 public void outputAnalyzer(Analyzer a ,String input) throws Exception{
   TokenStream ts = a.tokenStream("dummy",new StringReader(input));
   StringReader readerInput = new StringReader(input);
   while(true){
     Token t = ts.next();
     if(t!=null){
       System.out.print(t.termText);
       System.out.print(" ");
     }
     else
     break;

数据挖掘交友

   }
 System.out.println(" ");
 ts.close();
 }

数据挖掘工具

  public void testSimpleAnalyzer() throws Exception {
    //学习使用SimpleAnalyzer();
    //SimpleAnalyzer将除去letter之外的符号全部过滤掉,并且将所有的字符lowcase化
    Analyzer a = new SimpleAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo", "bar", "foo", "bar" }); 数据挖掘论坛
    assertAnalyzesTo(a, "U.S.A.",
                     new String[] { "u", "s", "a" });
    assertAnalyzesTo(a, "C++",
                     new String[] { "c" });
    assertAnalyzesTo(a, "B2B",
                     new String[] { "b", "b" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "b" });
    assertAnalyzesTo(a, ""QUOTED" word",
                     new String[] { "quoted", "word" }); 数据挖掘论坛
    assertAnalyzesTo(a,"zhang ./ bu <> feng",
                     new String[]{"zhang","bu","feng"});
    ICTCLAS splitWord = new ICTCLAS();
    String result = splitWord.paragraphProcess("我爱共产党 i LOVE chanchan");
    assertAnalyzesTo(a,result,
                     new String[]{"我","爱","共产党","i","love","chanchan"});

数据挖掘研究院

  } 数据挖掘工具

  public void testWhiteSpaceAnalyzer() throws Exception {
    //WhiterspaceAnalyzer仅仅是去除空格,对字符没有lowcase化
    Analyzer a = new WhitespaceAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "FOO", "BAR" });
    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo.bar.FOO.BAR" });
    assertAnalyzesTo(a, "U.S.A.",


                     new String[] { "U.S.A." });
    assertAnalyzesTo(a, "C++",
                     new String[] { "C++" });
数据挖掘论坛

    assertAnalyzesTo(a, "B2B",
                     new String[] { "B2B" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "2B" });
    assertAnalyzesTo(a, ""QUOTED" word",
                     new String[] { ""QUOTED"", "word" });
数据挖掘交友

    assertAnalyzesTo(a,"zhang bu feng",
                     new String []{"zhang","bu","feng"});
    ICTCLAS splitWord = new ICTCLAS();
    String result = splitWord.paragraphProcess("我爱共产党 i love chanchan");
    assertAnalyzesTo(a,result,
                     new String[]{"我","爱","共产党","i","love","chanchan"});
  }

  public void testStopAnalyzer() throws Exception {
    //StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基础上
    //增加了去除StopWords的功能
   Analyzer a = new StopAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                     new String[]{"foo","bar","foo","bar"});
    ICTCLAS splitWord = new ICTCLAS();


    String result = splitWord.paragraphProcess("我爱共产党 i Love chanchan such");
    assertAnalyzesTo(a,result,
                     new String[]{"我","爱","共产党","i","love","chanchan"});
数据挖掘实验室

  }
  public void testStandardAnalyzer() throws Exception{
  //StandardAnalyzer的功能最为强大,对于中文采用的为单字切分
  Analyzer a = new StandardAnalyzer();
  assertAnalyzesTo(a,"foo bar Foo Bar",
                   new String[]{"foo","bar","foo","bar"});
  assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
                   new String[]{"foo","bar","foo","bar"});
  assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                     new String[]{"foo","bar","foo","bar"});
  assertAnalyzesTo(a,"张步峰是天大学生",
                   new String[]{"张","步","峰","是","天","大","学","生"});


  //验证去除英文的标点符号
  assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生",
                   new String[]{"张","步","峰","是","天","大","学","生"});
  //验证去除中文的标点符号
  assertAnalyzesTo(a,"张。、步。、峰是。天大。学生",
                   new String[]{"张","步","峰","是","天","大","学","生"});
  }
  public void testChineseAnalyzer() throws Exception{
  //可见ChineseAnalyzer在功能上和standardAnalyzer的功能差不多,但是可能在速度上慢于StandardAnalyzer
  Analyzer a = new ChineseAnalyzer();
数据挖掘工具

  //去空格
  assertAnalyzesTo(a,"foo bar Foo Bar",
                    new String[]{"foo","bar","foo","bar"});
   assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
                    new String[]{"foo","bar","foo","bar"});
   assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                      new String[]{"foo","bar","foo","bar"});
   assertAnalyzesTo(a,"张步峰是天大学生",
                    new String[]{"张","步","峰","是","天","大","学","生"});
   //验证去除英文的标点符号
   assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生",


                    new String[]{"张","步","峰","是","天","大","学","生"});
   //验证去除中文的标点符号
   assertAnalyzesTo(a,"张。、步。、峰是。天大。学生",
                    new String[]{"张","步","峰","是","天","大","学","生"});
   //不支持中英文写在一起
  // assertAnalyzesTo(a,"我爱你 i love chanchan",
  ///                  new String[]{"我","爱","你","i","love","chanchan"});
数据挖掘研究院

  }
  public void testCJKAnalyzer() throws Exception {
    //chedong写的CJKAnalyzer的功能在英文处理上的功能和StandardAnalyzer相同
    //但是在汉语的分词上,不能过滤掉标点符号,即使用二元切分
    Analyzer a = new CJKAnalyzer();
    assertAnalyzesTo(a,"foo bar Foo Bar",
                    new String[]{"foo","bar","foo","bar"});
    assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR",
                  new String[]{"foo","bar","foo","bar"});
    assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ",
                    new String[]{"foo","bar","foo","bar"});

   // assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生",
     //                new String[]{"张步","步峰","峰是","是天","天大","大学","学生"});
    //assertAnalyzesTo(a,"张。、步。、峰是。天大。学生",
     //                new String[]{"张步","步峰","峰是","是天","天大","大学","学生"});
   //支持中英文同时写
    assertAnalyzesTo(a,"张步峰是天大学生 i love",
                     new String[]{"张步","步峰","峰是","是天","天大","大学","学生","i","love"});

  }
  public void testTjuChineseAnalyzer() throws Exception{
      /**
       * TjuChineseAnlyzer的功能相当强大,在中文分词方面由于其调用的为ICTCLAS的java接口.
       * 所以其在中文方面性能上同与ICTCLAS.其在英文分词上采用了Lucene的StopAnalyzer,可以去除
       * stopWords,而且可以不区分大小写,过滤掉各类标点符号.
       */
      Analyzer a = new TjuChineseAnalyzer();
      String input = "体育讯 在被尤文淘汰之后,皇马主帅博斯克拒绝接受媒体对球队后防线的批评,同时还为自己排出的首发阵容进行了辩护。"+
          "“失利是全队的责任,而不仅仅是后防线该受指责,”博斯克说,“我并不认为我们踢得一塌糊涂。”“我们进入了半决赛,而且在晋级的道路上一路奋 "+
         "战。即使是今天的比赛我们也有几个翻身的机会,但我们面对的对手非常强大,他们踢得非常好。”“我们的球迷应该为过去几个赛季里我们在冠军杯中的表现感到骄傲。”"+


         "博斯克还说。对于博斯克在首发中排出了久疏战阵的坎比亚索,赛后有记者提出了质疑,认为完全应该将队内的另一 "+
         "名球员帕文派遣上场以加强后卫线。对于这一疑议,博斯克拒绝承担所谓的“责任”,认为球队的首发没有问题。“我们按照整个赛季以来的方式做了,"+
         "对于人员上的变化我没有什么可说的。”对于球队在本赛季的前景,博斯克表示皇马还有西甲联赛的冠军作为目标。“皇家马德里在冠军 "+
        "杯中战斗到了最后,我们在联赛中也将这么做。”"+
        "A Java User Group is a group of people who share a common interest in Java technology and meet on a regular basis to share"+
       " technical ideas and information. The actual structure of a JUG can vary greatly - from a small number of friends and coworkers"+
      " meeting informally in the evening, to a large group of companies based in the same geographic area. "+

数据挖掘交友


      "Regardless of the size and focus of a particular JUG, the sense of community spirit remains the same. ";
数据挖掘实验室

      outputAnalyzer(a,input);
    //此处我已经对大文本进行过测试,不会有问题效果很好
    outputAnalyzer(a,"我爱共产党 ,,。 I love China 我喜欢唱歌 ");
    assertAnalyzesTo(a,"我爱共产党 ,,。I love China 我喜欢唱歌",
                   new String[]{"爱","共产党","i","love","china","喜欢","唱歌"});
  }
}
数据挖掘工具

[数据挖掘专家] [数据挖掘研究院] [数据挖掘论坛] [数据挖掘实验室]
上一篇:基于逆向最大化词表中文分词法
下一篇:中文搜索引擎技术揭密:中文分词
最新评论共有 0 位网友发表了评论 , 查看所有评论
发表评论( 不能超过250字,需审核,请自觉遵守互联网相关政策法规。 )
匿名?
数据挖掘网站导航 数据挖掘论坛导航
  • 数据挖掘工具
  • 数据挖掘论坛
  • DataCruncher - Cognos
  • MineSet - MathSoft
  • Intelligent Miner - GainSmarts
  • Sqlserver - SAS - Clementine
  • CART - Weka - WizSoft
  • NeuroShell - ModelQuest
  • data mining tools - Darwin
  • 数据挖掘交友
  • 数据挖掘博客
  • 数据挖掘工具
  • 数据挖掘资源
  • 数据挖掘技术算法
  • 数据挖掘相关期刊、会议
  • 研究院联盟合作专区
  • 数据挖掘基础与相关技术
  • 数据挖掘厂商与就业
  • 数据挖掘研究者乐园
  • 知名厂商数据挖掘工具资料
  • 国内数据挖掘实验室
  • Foreign Data Mining Lab
  • 热点关注
  • Mercator: A Scalable, Extensible Web Cra
  • 什么是垂直搜索引擎(之二)
  • Writing a web crawler
  • 互联网搜索的未来
  • 国家版权局版权司副司长许超:关于搜索引擎
  • 百度数分钟内闪电裁员 企业软件事业部遭抛
  • 我对垂直搜索引擎的几点认识
  • Google Patent Filings by the Dozen
  • Manageability - Open Source Web Crawlers
  • 微软卡位第三代搜索技术 认为Google将很快
  • 论坛最新话题
  • Foundations of Statistical Natural Langu
  • Game Theory meet Data Mining: A Recent P
  • System Building: How does it help or hin
  • 数据挖掘与Clementine培训
  • 新手报到
  • 求 SASEM 客户流失预测分析
  • 数据挖掘工程师/搜索研究院—北京——无线
  • 数据挖掘入门介绍(如何着手数据挖掘)
  • Information Overload Survey Results
  • The INEX 2005 Workshop on Element Retrie
  • 相关资讯
  • 谷歌宣布进军可替代能源 计划投资4.4万亿美
  • 搜索大战成Web 2.0操作系统之争
  • 7月美国搜索市场环比增长2% 雅虎微软成输家
  • 网页面向搜索引擎的搜索引擎优化
  • 史上最具技术创新的10大搜索引擎
  • Google如何预测下一届美国总统
  • 微软1亿美元收购语义搜索引擎Powerset
  • 很黄很暴力:人肉搜索引擎
  • OpenSocial只不过是Google公关骗局
  • 数据之美 百度GOOGLE统计的秘密
  • 数据挖掘实验室资料
  • 数据挖掘博客地址
  • 数据挖掘实验室网站地址
  • Prepare for Medicare audits by using dat
  • 注册成为SAS用户与爱好者俱乐部会员
  • 水南梅
  • 明日烟
  • 新人报道
  • 下载
  • 厦门服务器托管,450元/月—0592-5177319 高
  • 买空间送域名--0592-5177319 高静