Quantcast
Channel: SCN : All Content - Process Integration (PI) & SOA Middleware
Viewing all articles
Browse latest Browse all 7030

Read Pdf file to XML via java mapping

$
0
0

Hi

 

The scenario is File to Proxy: I have to read a PDF file's content (all text). I have written the following code:

 

import java.io.IOException;

import java.io.FileReader;

import java.io.BufferedReader;

import java.io.*;

 

 

import org.apache.pdfbox.util.*;

import org.apache.pdfbox.pdmodel.*;

 

 

class ReadPdf

{

 

 

  public static void main(String args[])

  {

 

 

 

 

    PDDocument pd;

    BufferedWriter wr;

 

 

 

 

    try {

        File input = new File("original.pdf");  // The PDF file from where you would like to extract

          File output = new File("SampleText.txt"); // The text file where you are going to store the extracted data

          pd = PDDocument.load(input);

          System.out.println(pd.getNumberOfPages()); //prints number of pages

          System.out.println(pd.isEncrypted()); //false as not encrypted

          pd.save("CopyOfOriginal.pdf"); // Creates a copy called "CopyOforiginal.pdf"

          PDFTextStripper stripper = new PDFTextStripper();

          stripper.setStartPage(1); //Start extracting from page 1

          stripper.setEndPage(1); //Extract till page 1

          wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)));

          stripper.writeText(pd, wr);

          if (pd != null) {

              pd.close();

          }

          // I use close() to flush the stream.

          wr.close();

 

 

 

 

  }

  catch (Exception e)

  {

        e.printStackTrace();

         }

 

 

 

 

 

 

 

 

 

 

  }

 

 

 

 

}

 

 

 

 

It works. I have modified it to work as a Java mapping, as follows:

 

import java.io.InputStream;

import java.io.OutputStream;

import java.util.Map;

import java.util.HashMap;

 

 

import java.io.IOException;

import java.io.FileReader;

import java.io.BufferedReader;

import java.io.*;

 

 

import org.apache.pdfbox.util.*;

import org.apache.pdfbox.pdmodel.*;

 

 

 

 

import com.sap.aii.mapping.api.AbstractTransformation;

import com.sap.aii.mapping.api.StreamTransformationException;

import com.sap.aii.mapping.api.TransformationInput;

import com.sap.aii.mapping.api.TransformationOutput;

 

 

public class PdftoXml extends AbstractTransformation

{

  public void transform(TransformationInput in, TransformationOutput out) throws StreamTransformationException

  {

 

 

   

 

 

    PDDocument pd;

    BufferedWriter wr;

 

 

 

 

    try {

      

          pd = PDDocument.load(in.getInputPayload().getInputStream()); //convert Tranformationimput to inputstream than pass it to PDDocument constructor to read Pdf from Inputstream.

 

 

          //System.out.println(pd.getNumberOfPages()); //prints number of pages

        

        

          PDFTextStripper stripper = new PDFTextStripper();

          stripper.setStartPage(1); //Start extracting from page 1

          stripper.setEndPage(1); //Extract till page 1

 

 

 

 

  String str = stripper.getText(pd);

  String content[] = str.split("\n");

 

 

 

 

  String result ="<?xml version=\"1.0\" encoding=\"UTF-8\"?>";

  result = result.concat("<ns0:MTPdf xmlns:ns0=\"urn:mmm-com:pi:Vinay:10\">");

  result = result.concat("<field1>"+content[0]+"</field1>");

  result = result.concat("<field2>"+content[1]+"</field1>");

  result = result.concat("<field3>"+content[2]+"</field1>");

  result = result.concat("<field4>"+content[3]+"</field1>");

 

 

  result = result.concat("</ns0:MTPdf>");

 

 

  out.getOutputPayload().getOutputStream().write(result.getBytes("UTF-8")); //writing to output

 

 

 

 

 

 

  }

  catch (Exception e)

  {

        e.printStackTrace();

         }

 

 

 

 

 

 

  }

}

 

 

I am using the third-party Apache API "PDFBox". Where should I import this API in the ESR for my Java mapping to work?


Viewing all articles
Browse latest Browse all 7030

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>