2012-07-24

Advanced Query in Lucene 3.6.0

除了最常用的 TermQuery 與 NumericRangeQuery 外,Lucene 還提供了許多查詢方式:
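  • 所有索引查詢(MatchAllDocsQuery)
  • 字首查詢(PrefixQuery)
  • 字首範圍查詢(TermRangeQuery)
  • 布林查詢(BooleanQuery)
  • 片語查詢(PhraseQuery)
  • 鬼牌查詢(WildcardQuery)
  • 模糊查詢(FuzzyQuery)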
/**
 * Exercises the Lucene 3.6 query types beyond TermQuery: match-all, prefix,
 * term range, boolean, phrase, wildcard and fuzzy queries.
 *
 * <p>
 * Every test runs against a fresh in-memory index of twelve book titles that
 * is built in {@link #setUp()} and released in {@link #tearDown()}. Titles are
 * indexed through a StandardAnalyzer, and all query terms below are written in
 * lower case.
 */
public class SearchTestCase extends TestCase {

  private static final Version VERSION = Version.LUCENE_36;
  private static final String F_TITLE = "title";

  /** In-memory index; one per test instance. */
  private final Directory directory = new RAMDirectory();
  private IndexWriter writer;
  /** Reader backing {@link #searcher}; tracked so tearDown can close it. */
  private IndexReader reader;
  /** Searcher handed out by {@link #createSearcher()}; closed in tearDown. */
  private IndexSearcher searcher;

  /**
   * Builds the index: adds the twelve sample titles and commits them.
   */
  @Override
  protected void setUp() throws Exception {
    super.setUp();

    // create index
    this.writer = this.createWriter();

    System.out.println("addDocument...");
    this.writer.addDocument(this.createDocument("Spring Core"));
    this.writer.addDocument(this.createDocument("SpringMVC"));
    this.writer.addDocument(this.createDocument("Spring AOP"));
    this.writer.addDocument(this.createDocument("Hibernate Core"));
    this.writer.addDocument(this.createDocument("Hibernate Search"));
    this.writer.addDocument(this.createDocument("Hibernator"));
    this.writer.addDocument(this.createDocument("Lucene Core"));
    this.writer.addDocument(this.createDocument("jQuery in action"));
    this.writer.addDocument(this.createDocument("Java in a nutshell, edition"));
    this.writer.addDocument(this.createDocument("Java in a nutshell, 2edition"));
    this.writer.addDocument(this.createDocument("Java in a nutshell, 3edition"));
    this.writer.addDocument(this.createDocument("Java in a nutshell, 4edition"));

    System.out.println("commit...");
    // With a lot of data the commits can be issued in batches.
    // After a commit an IndexReader can be opened even though the writer is
    // still open; without a commit (or close) the reader cannot be used.
    this.writer.commit();
  }

  /**
   * Creates a document holding a single stored, analyzed title field.
   *
   * @param title the title text to index
   * @return the new document
   */
  private Document createDocument(String title) {
    Document doc = new Document();
    doc.add(new Field(SearchTestCase.F_TITLE, title, Field.Store.YES,
        Field.Index.ANALYZED));
    return doc;
  }

  /**
   * Releases the searcher, reader, writer and directory opened for the test,
   * then lets the superclass tear down.
   */
  @Override
  protected void tearDown() throws Exception {
    try {
      this.closeSearcher();
      this.closeWriter();
      this.directory.close();
    }
    finally {
      super.tearDown();
    }
  }

  /**
   * Match-all query: returns every indexed document; the scores carry no
   * ranking information.
   *
   * @throws IOException if the index cannot be read
   */
  public void testMatchAllDocsQuery() throws IOException {
    System.out.println("testMatchAllDocsQuery...");
    IndexSearcher searcher = this.createSearcher();

    TopDocs hits = searcher.search(new MatchAllDocsQuery(), 100);
    this.showDocuments(searcher, hits);
    assertEquals(12, hits.totalHits);
  }

  /**
   * Prefix query, contrasted with an exact term query.
   *
   * @throws IOException if the index cannot be read
   */
  public void testPrefixQuery() throws IOException {
    System.out.println("testPrefixQuery...");
    IndexSearcher searcher = this.createSearcher();

    // TermQuery: only the exact term "spring" (Spring Core, Spring AOP)
    TopDocs hits = searcher.search(new TermQuery(new Term(
        SearchTestCase.F_TITLE, "spring")), 100);
    this.showDocuments(searcher, hits);
    assertEquals(2, hits.totalHits);

    // PrefixQuery: every term starting with "spring", so SpringMVC matches too
    hits = searcher.search(new PrefixQuery(new Term(SearchTestCase.F_TITLE,
        "spring")), 100);
    this.showDocuments(searcher, hits);
    assertEquals(3, hits.totalHits);
  }

  /**
   * Term range query. It has to enumerate index terms, so it can be expensive
   * on a large index.
   *
   * @throws IOException if the index cannot be read
   */
  public void testTermRangeQuery() throws IOException {
    System.out.println("testTermRangeQuery...");
    IndexSearcher searcher = this.createSearcher();

    // The upper bound is "m" rather than "l": terms are compared as strings
    // and "lucene" sorts after "l", so an upper bound of "l" would not
    // include the Lucene title.
    TopDocs hits = searcher.search(new TermRangeQuery(SearchTestCase.F_TITLE,
        "h", "m", true, true), 100);
    this.showDocuments(searcher, hits);
    assertEquals(9, hits.totalHits);
  }

  /**
   * Boolean query: combines clauses with MUST, SHOULD and MUST_NOT, and nests
   * boolean queries inside each other.
   *
   * @throws IOException if the index cannot be read
   */
  public void testBooleanQuery() throws IOException {
    System.out.println("testBooleanQuery...");
    IndexSearcher searcher = this.createSearcher();

    // +spring core
    BooleanQuery bq = new BooleanQuery();
    // MUST - the term is required
    bq.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "spring")),
        BooleanClause.Occur.MUST);
    // SHOULD - the term is optional; a match raises the score
    bq.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "core")),
        BooleanClause.Occur.SHOULD);
    TopDocs hits = searcher.search(bq, 100);
    this.showDocuments(searcher, hits);
    assertEquals(2, hits.totalHits);

    // -spring core lucene
    bq = new BooleanQuery();
    // MUST_NOT - the term must be absent
    bq.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "spring")),
        BooleanClause.Occur.MUST_NOT);
    // SHOULD - the term is optional; a match raises the score
    bq.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "core")),
        BooleanClause.Occur.SHOULD);
    bq.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "lucene")),
        BooleanClause.Occur.SHOULD);
    hits = searcher.search(bq, 100);
    this.showDocuments(searcher, hits);
    assertEquals(2, hits.totalHits);

    // (+spring -core) or (+hibernate -core)
    // A BooleanQuery accepts any Query as a clause, including another
    // BooleanQuery; the two sub-queries are joined with SHOULD.
    bq = new BooleanQuery();
    bq.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "spring")),
        BooleanClause.Occur.MUST);
    bq.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "core")),
        BooleanClause.Occur.MUST_NOT);
    BooleanQuery bq2 = new BooleanQuery();
    bq2.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "hibernate")),
        BooleanClause.Occur.MUST);
    bq2.add(new TermQuery(new Term(SearchTestCase.F_TITLE, "core")),
        BooleanClause.Occur.MUST_NOT);
    BooleanQuery bq3 = new BooleanQuery();
    bq3.add(bq, BooleanClause.Occur.SHOULD);
    bq3.add(bq2, BooleanClause.Occur.SHOULD);
    hits = searcher.search(bq3, 100);
    this.showDocuments(searcher, hits);
    assertEquals(2, hits.totalHits);
  }

  /**
   * Phrase query with different slop values.
   *
   * @throws IOException if the index cannot be read
   */
  public void testPhraseQuery() throws IOException {
    System.out.println("testPhraseQuery...");
    IndexSearcher searcher = this.createSearcher();

    // "hibernate" and "core" are adjacent, so the slop is 0
    TopDocs hits = searcher.search(this.createPhraseQuery(0, new String[] {
        "hibernate", "core"
    }), 100);
    this.showDocuments(searcher, hits);
    assertEquals(1, hits.totalHits);

    // "jquery" and "action" have one word between them, so the slop is 1
    hits = searcher.search(this.createPhraseQuery(1, new String[] {
        "jquery", "action"
    }), 100);
    this.showDocuments(searcher, hits);
    assertEquals(1, hits.totalHits);
  }

  /**
   * Builds a phrase query on the title field.
   *
   * @param slop the slop to set on the query
   * @param terms the phrase terms, in order
   * @return the phrase query
   */
  private PhraseQuery createPhraseQuery(int slop, String[] terms) {
    PhraseQuery q = new PhraseQuery();
    q.setSlop(slop);
    for (String s : terms) {
      q.add(new Term(SearchTestCase.F_TITLE, s));
    }
    return q;
  }

  /**
   * Wildcard query. It has to enumerate index terms, so it can be expensive
   * on a large index.
   *
   * @throws IOException if the index cannot be read
   */
  public void testWildcardQuery() throws IOException {
    System.out.println("testWildcardQuery...");
    IndexSearcher searcher = this.createSearcher();

    // * stands for zero or more characters: every term starting with spring
    TopDocs hits = searcher.search(new WildcardQuery(new Term(
        SearchTestCase.F_TITLE, "spring*")), 100);
    this.showDocuments(searcher, hits);
    assertEquals(3, hits.totalHits);

    // * may also lead the pattern: every term ending with core
    hits = searcher.search(new WildcardQuery(new Term(SearchTestCase.F_TITLE,
        "*core")), 100);
    this.showDocuments(searcher, hits);
    assertEquals(3, hits.totalHits);

    // ? stands for exactly one character: 2edition, 3edition, 4edition
    // (plain "edition" has no leading character and is not matched)
    hits = searcher.search(new WildcardQuery(new Term(SearchTestCase.F_TITLE,
        "?edition")), 100);
    this.showDocuments(searcher, hits);
    assertEquals(3, hits.totalHits);
  }

  /**
   * Fuzzy query. It has to enumerate index terms, so it can be expensive on a
   * large index.
   *
   * @throws IOException if the index cannot be read
   */
  public void testFuzzyQuery() throws IOException {
    System.out.println("testFuzzyQuery...");
    IndexSearcher searcher = this.createSearcher();

    // "hibernator" is close enough to "hibernate" to be selected as well
    TopDocs hits = searcher.search(new FuzzyQuery(new Term(
        SearchTestCase.F_TITLE, "hibernate")), 100);
    this.showDocuments(searcher, hits);
    assertEquals(3, hits.totalHits);
  }

  /**
   * Prints the stored title and score of every hit.
   *
   * @param searcher the searcher that produced the hits
   * @param hits the search result to dump
   */
  private void showDocuments(IndexSearcher searcher, TopDocs hits)
      throws CorruptIndexException, IOException {
    Document d;
    for (ScoreDoc sd : hits.scoreDocs) {
      d = searcher.doc(sd.doc);
      System.out.println(d.get(SearchTestCase.F_TITLE) + " - " + sd.score);
    }
  }

  /**
   * Opens a writer on the directory in CREATE mode with a StandardAnalyzer.
   */
  private IndexWriter createWriter() throws CorruptIndexException,
      LockObtainFailedException, IOException {
    IndexWriterConfig config = new IndexWriterConfig(SearchTestCase.VERSION,
        new StandardAnalyzer(SearchTestCase.VERSION));
    config.setOpenMode(OpenMode.CREATE);
    return new IndexWriter(this.directory, config);
  }

  /**
   * Opens a searcher on the current index and remembers it, together with
   * its reader, so that {@link #tearDown()} can close both. A searcher left
   * over from an earlier call is closed first.
   *
   * @return the newly opened searcher
   */
  private IndexSearcher createSearcher() throws CorruptIndexException,
      IOException {
    this.closeSearcher();
    this.reader = this.createReader();
    this.searcher = new IndexSearcher(this.reader);
    return this.searcher;
  }

  /**
   * Opens a new reader on the directory; the caller owns the reader and must
   * close it.
   */
  private IndexReader createReader() throws CorruptIndexException, IOException {
    return IndexReader.open(this.directory);
  }

  /**
   * Closes the writer if one is open. Failures are only reported so that the
   * rest of the teardown still runs.
   */
  private void closeWriter() {
    if (this.writer != null) {
      try {
        this.writer.close();
      }
      catch (IOException e) {
        e.printStackTrace();
      }
      finally {
        this.writer = null;
      }
    }
  }

  /**
   * Closes the tracked searcher and then its reader. Failures are only
   * reported so that the rest of the teardown still runs.
   */
  private void closeSearcher() {
    if (this.searcher != null) {
      try {
        this.searcher.close();
      }
      catch (IOException e) {
        e.printStackTrace();
      }
      finally {
        this.searcher = null;
      }
    }
    this.closeReader();
  }

  /**
   * Closes the tracked reader if one is open. Failures are only reported so
   * that the rest of the teardown still runs.
   */
  private void closeReader() {
    if (this.reader != null) {
      try {
        this.reader.close();
      }
      catch (IOException e) {
        e.printStackTrace();
      }
      finally {
        this.reader = null;
      }
    }
  }
}
---

沒有留言:

張貼留言