Extract Text from docx, pptx, xlsx using Apache POI 3.9

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.SAXException;

/**
 * Prints the plain text of Office Open XML documents (.docx, .pptx, .xlsx)
 * to standard output using the Apache POI text extractors.
 */
public class FileParser {

    /** File name used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_FILE_NAME = "fileName";

    /**
     * Extracts the text of a .docx package and prints it to standard output.
     *
     * <p>The package is not closed by this method; the caller keeps ownership.
     *
     * @param docx an opened package holding a Word document
     * @throws FileNotFoundException declared for caller compatibility
     * @throws IOException if the package content cannot be read
     * @throws XmlException if the document XML cannot be parsed
     * @throws InvalidFormatException if the package is not a valid Word document
     * @throws OpenXML4JException on any other package-level failure
     * @throws ParserConfigurationException declared for caller compatibility
     * @throws SAXException declared for caller compatibility
     */
    public void DocFileContentParser(OPCPackage docx) throws FileNotFoundException,
            IOException,
            XmlException,
            InvalidFormatException,
            OpenXML4JException,
            ParserConfigurationException,
            SAXException {
        XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
        System.out.println(extractor.getText());
    }

    /**
     * Extracts the text of a .pptx package and prints it to standard output.
     *
     * <p>The package is not closed by this method; the caller keeps ownership.
     *
     * @param pptx an opened package holding a PowerPoint presentation
     * @throws FileNotFoundException declared for caller compatibility
     * @throws IOException if the package content cannot be read
     * @throws InvalidFormatException if the package is not a valid presentation
     * @throws XmlException if the presentation XML cannot be parsed
     * @throws OpenXML4JException on any other package-level failure
     */
    public void ppFileContentParser(OPCPackage pptx) throws FileNotFoundException,
            IOException,
            InvalidFormatException,
            XmlException,
            OpenXML4JException {
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(pptx);
        System.out.println(extractor.getText());
    }

    /**
     * Extracts the text of a .xlsx package and prints it to standard output.
     *
     * <p>The package is not closed by this method; the caller keeps ownership.
     *
     * @param xlsx an opened package holding an Excel workbook
     * @throws FileNotFoundException declared for caller compatibility
     * @throws IOException if the package content cannot be read
     * @throws InvalidFormatException if the package is not a valid workbook
     * @throws XmlException if the workbook XML cannot be parsed
     * @throws OpenXML4JException on any other package-level failure
     */
    public void excelContentParser(OPCPackage xlsx) throws FileNotFoundException,
            IOException,
            InvalidFormatException,
            XmlException,
            OpenXML4JException {
        XSSFExcelExtractor extractor = new XSSFExcelExtractor(xlsx);
        System.out.println(extractor.getText());
    }

    /**
     * Opens a file, picks the extractor matching its extension and prints the
     * extracted text to standard output.
     *
     * <p>The extension is matched case-insensitively. A file whose extension is
     * not .docx, .xlsx or .pptx produces a message on standard error instead of
     * silently printing nothing.
     *
     * @param args optional; {@code args[0]} is the path of the file to parse.
     *             When absent, the placeholder name {@code "fileName"} is used.
     * @throws FileNotFoundException if the file does not exist or cannot be opened
     * @throws IOException if the file cannot be read or closed
     * @throws XmlException if the document XML cannot be parsed
     * @throws InvalidFormatException if the file is not a valid OPC package
     * @throws OpenXML4JException on any other package-level failure
     * @throws ParserConfigurationException declared for caller compatibility
     * @throws SAXException declared for caller compatibility
     */
    public static void main(String args[]) throws FileNotFoundException,
            IOException,
            XmlException,
            InvalidFormatException,
            OpenXML4JException,
            ParserConfigurationException,
            SAXException {
        // Take the path from the command line; fall back to the original
        // hard-coded placeholder so existing no-argument invocations behave the same.
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE_NAME;
        File file = new File(path);

        // try-with-resources guarantees the stream is closed even when
        // opening the package or extracting the text throws.
        try (FileInputStream fs = new FileInputStream(file)) {
            OPCPackage pkg = OPCPackage.open(fs);
            try {
                FileParser fp = new FileParser();
                String name = file.getName();
                if (hasExtension(name, ".docx")) {
                    fp.DocFileContentParser(pkg);
                } else if (hasExtension(name, ".xlsx")) {
                    fp.excelContentParser(pkg);
                } else if (hasExtension(name, ".pptx")) {
                    fp.ppFileContentParser(pkg);
                } else {
                    System.err.println("Unsupported file type: " + name);
                }
            } finally {
                // revert() releases the package without trying to save it;
                // nothing is modified here, so there is nothing to write back.
                pkg.revert();
            }
        }
    }

    /**
     * Tells whether {@code name} ends with {@code extension}, ignoring case.
     *
     * @param name the file name to test
     * @param extension the extension including the leading dot, e.g. ".docx"
     * @return {@code true} when the name ends with the extension
     */
    private static boolean hasExtension(String name, String extension) {
        // A negative start offset (name shorter than extension) makes
        // regionMatches return false, so no explicit length check is needed.
        int start = name.length() - extension.length();
        return name.regionMatches(true, start, extension, 0, extension.length());
    }
}
Advertisements

3 thoughts on “Extract Text from docx, pptx, xlsx using Apache POI 3.9”

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. (Log Out / Change)

Twitter picture

You are commenting using your Twitter account. (Log Out / Change)

Facebook photo

You are commenting using your Facebook account. (Log Out / Change)

Google+ photo

You are commenting using your Google+ account. (Log Out / Change)

Connecting to %s