public class DocumentExtractor extends Object
    // We have a document to process
    java.io.Reader document = ...;
    DocumentExtractor x = new DocumentExtractor();
    x.processHTML(document); // or processPlainText(document) for input in plain text format

    // Iterate through the hits (using a Java 1.5 feature, otherwise use a java.util.Iterator)
    for (Hit hit : x.getHits()) {
        System.out.println(hit.position + ": " + hit.text + ": " + hit.structure.toFormat("smiles"));
    }

The field hit.position contains the position of the first character of the name in the document.
Note that hit.text contains the name as it appears in the source document. A version cleaned
of possible OCR errors, typos, etc. can be retrieved with hit.structure.getName().
This class can also be run from the command line. It then expects the name of a plain text file as the first argument (or reads the text from the standard input when the argument is absent). The list of hits is printed on the standard output.
Modifier and Type | Class and Description |
---|---|
class |
DocumentExtractor.Hit
Deprecated.
An occurrence of a chemical name in the processed document.
|
static class |
DocumentExtractor.ProgressInfo
Deprecated.
|
static interface |
DocumentExtractor.ProgressListener
Deprecated.
|
Modifier and Type | Field and Description |
---|---|
static String |
propertyPage
Deprecated.
|
static String |
propertySourceDocument
Deprecated.
|
Constructor and Description |
---|
DocumentExtractor()
Deprecated.
Creates a new document extractor.
|
DocumentExtractor(File document)
Deprecated.
If the file name ends with ".gz", the content will be uncompressed automatically.
|
DocumentExtractor(File document,
String encoding)
Deprecated.
If the file name ends with ".gz", the content will be uncompressed automatically.
|
DocumentExtractor(Reader r)
Deprecated.
|
DocumentExtractor(String text)
Deprecated.
Extract structures from a String.
|
DocumentExtractor(URL document)
Deprecated.
|
DocumentExtractor(URLConnection document)
Deprecated.
Create a document extractor for the given URL connection.
|
Modifier and Type | Method and Description |
---|---|
void |
acceptElements(boolean on)
Deprecated.
|
void |
acceptGenericNames(boolean on)
Deprecated.
Whether to accept generic, frequent names like "water".
|
void |
acceptGroups(boolean on)
Deprecated.
|
void |
acceptIons(boolean on)
Deprecated.
|
void |
clearHits()
Deprecated.
Clears the list of hits.
|
List<DocumentExtractor.Hit> |
getHits()
Deprecated.
Returns the hits found in the documents processed so far.
|
static void |
main(String[] args)
Deprecated.
Expects the name of a plain text file as the first argument
(or reads the text from the standard input when the argument is absent).
|
static void |
printEncodingError()
Deprecated.
|
void |
processHTML()
Deprecated.
Extract names from an HTML document.
|
void |
processHTML(DocumentExtractor.ProgressListener progressListener)
Deprecated.
Extract names from an HTML document.
|
void |
processHTML(Reader r)
Deprecated.
Extract names from an HTML document.
|
void |
processPlainText()
Deprecated.
Extract names from a plain text document.
|
void |
processPlainText(DocumentExtractor.ProgressListener progressListener)
Deprecated.
Extract names from a plain text document.
|
void |
processPlainText(Reader r)
Deprecated.
Extract names from a plain text document.
|
static DocumentExtractor |
readPDF(File pdf)
Deprecated.
Creates a DocumentExtractor to process the given PDF document.
|
static DocumentExtractor |
readPDF(InputStream pdfStream)
Deprecated.
Creates a DocumentExtractor to process the given PDF document.
|
void |
setCasNumberLookup(boolean value)
Deprecated.
Enable or disable the lookup of CAS numbers (requires network access).
|
public static final String propertySourceDocument
public static final String propertyPage
public DocumentExtractor()
public DocumentExtractor(File document) throws IOException
IOException
public DocumentExtractor(File document, String encoding) throws IOException
IOException
public DocumentExtractor(URL document) throws IOException
IOException
public DocumentExtractor(URLConnection document) throws IOException
IOException
public DocumentExtractor(Reader r)
public DocumentExtractor(String text)
public void setCasNumberLookup(boolean value)
public void acceptElements(boolean on)
public void acceptIons(boolean on)
public void acceptGenericNames(boolean on)
public void acceptGroups(boolean on)
public static void main(String[] args)
public static void printEncodingError()
public void processPlainText(Reader r) throws IOException
IOException
public void processPlainText() throws IOException
IOException
public void processPlainText(DocumentExtractor.ProgressListener progressListener) throws IOException
IOException
public void processHTML(Reader r) throws IOException
IOException
public void processHTML() throws IOException
IOException
public void processHTML(DocumentExtractor.ProgressListener progressListener) throws IOException
IOException
public List<DocumentExtractor.Hit> getHits()
public void clearHits()
public static DocumentExtractor readPDF(File pdf) throws IOException
IOException
public static DocumentExtractor readPDF(InputStream pdfStream) throws IOException
IOException