Want to follow the news you care about?
Don't want to miss any action from the Premier League, the Spanish League, and other leagues?
Want to make an app with your own layout?
Check out NTyles.
Don't want to miss any action from the Premier League, the Spanish League, and other leagues?
Want to make an app with your own layout?
Check out NTyles.
Get it on....
Or you can download it from here.
// Indexer.java
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package com.blogspot.computergodzilla;

/* The following code snippet uses APACHE LUCENE 3.4.0 */
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index of the {@code .txt} files found directly inside
 * {@link #sourceFilePath} and writes it to {@link #indexFilePath}.
 * Sub-directories are not descended into and non-{@code .txt} files are skipped.
 *
 * <p>All the work happens in the private constructor, which is invoked from
 * {@link #main(String[])}. Failures are reported on standard output and never
 * propagate to the caller.
 *
 * @author Mubin Shrestha
 */
public class Indexer {

    /** Folder whose .txt files are indexed; edit to point at your own data. */
    private final String sourceFilePath = "H:/FolderToIndex";

    /** Folder in which the index is created; edit to choose your own location. */
    private final String indexFilePath = "H:/INDEXDIRECTORY";

    /** Open index writer, or {@code null} if it could not be created. */
    private IndexWriter writer = null;

    /** {@link #indexFilePath} as a {@link File}; set by {@link #createIndexWriter()}. */
    private File indexDirectory = null;

    /**
     * Runs the whole indexing job: opens the writer, indexes every eligible
     * file, closes the writer and prints the document count and elapsed time.
     *
     * <p>The {@code throws} clause is kept for compatibility; every exception
     * is caught and reported inside the constructor.
     *
     * @throws FileNotFoundException never thrown in practice (see above)
     * @throws CorruptIndexException never thrown in practice (see above)
     * @throws IOException never thrown in practice (see above)
     */
    private Indexer() throws FileNotFoundException, CorruptIndexException, IOException {
        try {
            long start = System.currentTimeMillis();
            createIndexWriter();
            if (writer == null) {
                // createIndexWriter() already reported the cause; without a
                // writer every later step would only fail with a NullPointerException.
                System.out.println("Sorry task cannot be completed");
                return;
            }
            try {
                checkFileValidity();
            } finally {
                // Always release the index write lock, even if indexing blew up.
                closeIndexWriter();
            }
            long end = System.currentTimeMillis();
            System.out.println("Total Document Indexed : " + TotalDocumentsIndexed());
            // The original divided by (100 * 60), which is neither seconds nor
            // minutes and truncated to 0 for short runs; report plain milliseconds.
            System.out.println("Total time : " + (end - start) + " ms");
        } catch (Exception e) {
            System.out.println("Sorry task cannot be completed : " + e);
        }
    }

    /**
     * Creates the index directory if necessary and opens an {@link IndexWriter}
     * on it using a {@link StandardAnalyzer} configured for Lucene 3.4.
     * On failure a message is printed and {@link #writer} is left {@code null}.
     */
    private void createIndexWriter() {
        try {
            indexDirectory = new File(indexFilePath);
            if (!indexDirectory.exists()) {
                // mkdirs() also creates missing parent folders; mkdir() would not.
                indexDirectory.mkdirs();
            }
            FSDirectory dir = FSDirectory.open(indexDirectory);
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
            writer = new IndexWriter(dir, config);
        } catch (Exception ex) {
            System.out.println("Sorry cannot get the index writer : " + ex);
        }
    }

    /**
     * Walks the files directly inside {@link #sourceFilePath} and indexes each
     * one accepted by {@link #isIndexable(File)}. A failure on one file is
     * reported and does not stop the remaining files from being processed.
     */
    private void checkFileValidity() {
        File[] filesToIndex = new File(sourceFilePath).listFiles();
        if (filesToIndex == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println("Sorry cannot read source folder " + sourceFilePath);
            return;
        }
        for (File file : filesToIndex) {
            try {
                if (isIndexable(file)) {
                    System.out.println();
                    System.out.println("INDEXING FILE " + file.getAbsolutePath() + "......");
                    indexTextFiles(file);
                    System.out.println("INDEXED FILE " + file.getAbsolutePath() + " :-) ");
                }
            } catch (Exception e) {
                System.out.println("Sorry cannot index " + file.getAbsolutePath() + " : " + e);
            }
        }
    }

    /**
     * Tells whether a file should be indexed: a regular, visible, readable,
     * non-empty file whose name ends with {@code .txt}.
     *
     * @param file candidate file
     * @return {@code true} if the file qualifies for indexing
     */
    private boolean isIndexable(File file) {
        // isFile() already implies that the path exists and is not a directory.
        return file.isFile()
                && !file.isHidden()
                && file.canRead()
                && file.length() > 0
                && file.getName().endsWith(".txt");
    }

    /**
     * Adds one file to the index as a document with three fields:
     * {@code content} (supplied through a reader over the file), and
     * {@code filename} and {@code fullpath} (both stored and analyzed).
     *
     * @param file file to index
     * @throws CorruptIndexException if the writer reports a corrupt index
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    private void indexTextFiles(File file) throws CorruptIndexException, IOException {
        FileReader contentReader = new FileReader(file);
        try {
            Document doc = new Document();
            doc.add(new Field("content", contentReader));
            doc.add(new Field("filename", file.getName(),
                    Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("fullpath", file.getAbsolutePath(),
                    Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Guarantees the file handle is released even when addDocument fails;
            // closing a reader that is already closed has no effect.
            contentReader.close();
        }
    }

    /**
     * Opens the index read-only and reports how many documents it holds.
     *
     * @return the reader's {@code maxDoc()} value, or 0 if the index cannot be opened
     */
    private int TotalDocumentsIndexed() {
        FSDirectory dir = null;
        IndexReader reader = null;
        try {
            dir = FSDirectory.open(indexDirectory);
            reader = IndexReader.open(dir);
            return reader.maxDoc();
        } catch (Exception ex) {
            System.out.println("Sorry no index found");
            return 0;
        } finally {
            // The original leaked both the reader and the directory handle.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only handle fails.
                }
            }
            if (dir != null) {
                try {
                    dir.close();
                } catch (IOException ignored) {
                    // Same as above: best-effort cleanup only.
                }
            }
        }
    }

    /**
     * Optimizes the index and closes the {@link IndexWriter}. The writer is
     * closed even when optimization fails, so the index lock is not left behind.
     */
    private void closeIndexWriter() {
        if (writer == null) {
            return;
        }
        try {
            try {
                writer.optimize();
            } finally {
                writer.close();
            }
        } catch (Exception e) {
            System.out.println("Indexer Cannot be closed : " + e);
        }
    }

    /**
     * Main method: runs the indexer with the paths configured in this class.
     *
     * @param arg command-line arguments (ignored)
     */
    public static void main(String arg[]) {
        try {
            new Indexer();
        } catch (Exception ex) {
            System.out.println("Cannot Start :(");
        }
    }
}
thanks
Reply | Delete
It's really helpful for me.
Hi, Can you please help me with the code for finding the text from multiple PDF files for text search engine?
Reply | Delete
Also, this post is very helpful. Thanks!