Search This Blog

Translate

Monday, December 24, 2012

Apache Lucene: How to use Apache Lucene 3.4.0 to index text files in Java


Want to follow the news you care about?
Don't want to miss any action from the Premier League, the Spanish League and other leagues?
Want to make an app with your own layout?

Check out NTyles.

Get it on....


NTyles-App




OR
you can download it from here.

//Indexer.java
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package com.blogspot.computergodzilla;


/*The following code snippet uses APACHE LUCENE 3.4.0*/
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene 3.4.0 index of the plain-text files found directly inside
 * {@link #sourceFilePath} and stores it in {@link #indexFilePath}.
 *
 * <p>Only regular, non-hidden, readable, non-empty files whose name ends in
 * {@code .txt} (any letter case) are indexed; sub-directories are not
 * descended into. Each file becomes one document with three fields:
 * {@code content} (tokenized from the file, not stored), {@code filename}
 * and {@code fullpath} (both stored and analyzed).
 *
 * <p>The writer is opened with the default {@code IndexWriterConfig}, so no
 * explicit open mode is requested here.
 *
 * @author Mubin Shrestha
 */
public class Indexer {

    /** Extension (including the dot) a file must have to be indexed. */
    private static final String TEXT_EXTENSION = ".txt";

    private final String sourceFilePath = "H:/FolderToIndex";    //give the location of the source files location here
    private final String indexFilePath = "H:/INDEXDIRECTORY";   //give the location where you guys want to create index
    private IndexWriter writer = null;
    private File indexDirectory = null;
    /** Directory backing {@link #writer}; kept so it can be closed with the writer. */
    private FSDirectory writerStore = null;

    /**
     * Runs the whole indexing job: opens the writer, indexes every eligible
     * file, closes the writer and prints a short summary.
     *
     * <p>All failures are reported on standard output; nothing is propagated
     * to the caller. The {@code throws} clause is kept only for backward
     * compatibility with existing callers.
     *
     * @throws FileNotFoundException never thrown in practice
     * @throws CorruptIndexException never thrown in practice
     * @throws IOException never thrown in practice
     */
    private Indexer() throws FileNotFoundException, CorruptIndexException, IOException {
        long start = System.currentTimeMillis();
        try {
            createIndexWriter();
            if (writer == null) {
                // createIndexWriter() already reported the reason; there is
                // nothing to index into, so stop instead of failing per file.
                return;
            }
            try {
                checkFileValidity();
            } finally {
                // Always release the write lock, even if indexing blew up.
                closeIndexWriter();
            }
            long elapsedMillis = System.currentTimeMillis() - start;
            System.out.println("Total Document Indexed : " + totalDocumentsIndexed());
            System.out.println("Total time : " + elapsedMillis + " ms");
        } catch (Exception e) {
            // Top-level boundary: report the cause rather than hiding it.
            System.out.println("Sorry task cannot be completed : " + e);
        }
    }

    /**
     * Creates the index directory if necessary and opens an {@link IndexWriter}
     * on it using a {@link StandardAnalyzer} for {@code Version.LUCENE_34}.
     *
     * <p>On failure a message is printed and {@link #writer} is left
     * {@code null}.
     */
    private void createIndexWriter() {
        try {
            indexDirectory = new File(indexFilePath);
            // mkdirs() also creates missing parents; check the result so a
            // failure is reported here rather than as an obscure Lucene error.
            if (!indexDirectory.isDirectory() && !indexDirectory.mkdirs()) {
                System.out.println("Sorry cannot get the index writer : cannot create "
                        + indexDirectory.getAbsolutePath());
                return;
            }
            FSDirectory dir = FSDirectory.open(indexDirectory);
            try {
                StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
                writer = new IndexWriter(dir, config);
                writerStore = dir;
            } finally {
                if (writer == null) {
                    // Writer construction failed: do not leak the directory.
                    dir.close();
                }
            }
        } catch (IOException ex) {
            System.out.println("Sorry cannot get the index writer : " + ex);
        }
    }

    /**
     * Lists the source directory and indexes every file accepted by
     * {@link #isIndexable(File)}. A failure on one file is reported and the
     * remaining files are still processed.
     */
    private void checkFileValidity() {
        File[] filesToIndex = new File(sourceFilePath).listFiles();
        if (filesToIndex == null) {
            // listFiles() returns null when the path is not a directory or
            // cannot be read.
            System.out.println("Sorry cannot read source directory " + sourceFilePath);
            return;
        }
        for (File file : filesToIndex) {
            if (!isIndexable(file)) {
                continue;
            }
            try {
                System.out.println();
                System.out.println("INDEXING FILE " + file.getAbsolutePath() + "......");
                indexTextFiles(file);
                System.out.println("INDEXED FILE " + file.getAbsolutePath() + " :-) ");
            } catch (Exception e) {
                // Deliberately broad: indexing is best-effort per file.
                System.out.println("Sorry cannot index " + file.getAbsolutePath() + " : " + e);
            }
        }
    }

    /**
     * Tells whether a file should be indexed.
     *
     * @param file candidate file
     * @return {@code true} for a regular, visible, readable, non-empty file
     *         whose name ends in {@code .txt}, ignoring case
     */
    private boolean isIndexable(File file) {
        String name = file.getName();
        // regionMatches returns false (rather than throwing) when the name is
        // shorter than the extension and the offset goes negative.
        boolean hasTextExtension = name.regionMatches(true,
                name.length() - TEXT_EXTENSION.length(), TEXT_EXTENSION, 0, TEXT_EXTENSION.length());
        return file.isFile()
                && !file.isHidden()
                && file.canRead()
                && file.length() > 0
                && hasTextExtension;
    }

    /**
     * Adds one file to the index as a single document.
     *
     * @param file file to index; its content is read with the platform
     *             default charset (as {@link FileReader} does)
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    private void indexTextFiles(File file) throws CorruptIndexException, IOException {
        FileReader contentReader = new FileReader(file);
        try {
            Document doc = new Document();
            doc.add(new Field("content", contentReader));
            doc.add(new Field("filename", file.getName(),
                    Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("fullpath", file.getAbsolutePath(),
                    Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Guarantees the file handle is released even if addDocument fails.
            contentReader.close();
        }
    }

    /**
     * Counts the documents currently held in the index directory.
     *
     * @return number of documents in the index, or 0 if the index cannot be opened
     */
    private int totalDocumentsIndexed() {
        try {
            FSDirectory dir = FSDirectory.open(indexDirectory);
            try {
                IndexReader reader = IndexReader.open(dir);
                try {
                    return reader.numDocs();
                } finally {
                    reader.close();
                }
            } finally {
                dir.close();
            }
        } catch (IOException ex) {
            System.out.println("Sorry no index found : " + ex);
        }
        return 0;
    }

    /**
     * Optimizes and closes the {@link IndexWriter}, then closes its directory.
     * Does nothing if no writer is open.
     */
    private void closeIndexWriter() {
        if (writer == null) {
            return;
        }
        try {
            try {
                writer.optimize();
            } finally {
                // close() must run even if optimize() fails, otherwise the
                // index write lock stays held.
                writer.close();
            }
        } catch (IOException e) {
            System.out.println("Indexer Cannot be closed : " + e);
        } finally {
            writer = null;
            if (writerStore != null) {
                try {
                    writerStore.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if the directory fails to close.
                }
                writerStore = null;
            }
        }
    }

    /**
     * Entry point: runs the indexer with the paths configured in this class.
     *
     * @param arg command-line arguments (unused)
     */
    public static void main(String arg[]) {
        try {
            new Indexer();
        } catch (Exception ex) {
            System.out.println("Cannot Start :( " + ex);
        }
    }
}





2 comments:

  1. thanks
    its realyy helpful for me

    ReplyDelete
  2. Hi, Can you please help me with the code for finding the text from multiple PDF files for text search engine?

    Also,This post is very helpful.Thanks!

    ReplyDelete