Apache Ant/Converting PDF to XML

From Wikibooks, open books for an open world
Jump to: navigation, search

Apache Ant Project to Extract Text From PDF

<!--
    Ant build file that runs the Apache Tika application jar in a forked JVM
    to convert a PDF document into XHTML.

    Every property below can be overridden on the command line, e.g.
        ant -Dinput-pdf-file=report.pdf -Doutput-clean-xhtml-file=report.xhtml
-->
<project name="extract-text-from-pdf" default="extract-text-from-pdf">
    <description>Sample invocations of Apache Tika</description>

    <!-- Directory that holds the Tika application jar. -->
    <property name="lib.dir" value="../lib"/>
    <!-- Tika application jar to execute; override to use a different Tika version. -->
    <property name="tika.jar" value="${lib.dir}/tika-app-1.3.jar"/>
    <!-- Maximum heap size of the forked JVM; raise it for large documents. -->
    <property name="tika.maxmemory" value="128m"/>

    <!-- PDF to read and XHTML file to write. -->
    <property name="input-pdf-file" value="myDocument.pdf"/>
    <property name="output-clean-xhtml-file" value="output-clean.xhtml"/>

    <target name="extract-text-from-pdf"
            description="Convert the input PDF to XHTML using Apache Tika">
        <echo message="Extracting XML from PDF: ${input-pdf-file} to ${output-clean-xhtml-file}"/>
        <!--
            The PDF is fed to the forked process on standard input (input=) and
            its standard output is captured into the XHTML file (output=).
            logError="true" sends the process's error stream to the Ant log;
            without it, stderr would be written into the output file as well
            and could corrupt the generated XHTML.
            failonerror="true" fails the build when the process exits non-zero.
        -->
        <java jar="${tika.jar}" fork="true" failonerror="true" logError="true"
            maxmemory="${tika.maxmemory}" input="${input-pdf-file}" output="${output-clean-xhtml-file}">
            <!-- -x selects Tika's XHTML output mode. -->
            <arg value="-x"/>
        </java>
    </target>
</project>