idok-commit AT lists.psi.ch

Subject: Commit emails of the iDok project

List archive

[idok-commit] idok commit r165 - in trunk: java/ch/idok/dmsd/impl/extractor/html lib

From: "AFS account Stadler Hans Christian" <stadler_h AT savannah.psi.ch>
To: idok-commit AT lists.psi.ch
Subject: [idok-commit] idok commit r165 - in trunk: java/ch/idok/dmsd/impl/extractor/html lib
Date: Thu, 7 Aug 2008 12:24:41 +0200
List-archive: <https://lists.web.psi.ch/pipermail/idok-commit/>
List-id: Commit emails of the iDok project <idok-commit.lists.psi.ch>

Author: stadler_h
Date: Thu Aug 7 12:24:41 2008
New Revision: 165

Log:
Changed the HTML parser implementation because the default Sun parser does
not handle charsets properly.

Added:
trunk/lib/htmlparser.jar (contents, props changed)
Modified:
trunk/java/ch/idok/dmsd/impl/extractor/html/HtmlExtractorFactory.java

Modified:
trunk/java/ch/idok/dmsd/impl/extractor/html/HtmlExtractorFactory.java
==============================================================================
--- trunk/java/ch/idok/dmsd/impl/extractor/html/HtmlExtractorFactory.java	(original)
+++ trunk/java/ch/idok/dmsd/impl/extractor/html/HtmlExtractorFactory.java	Thu Aug 7 12:24:41 2008
@@ -25,12 +25,13 @@
*/

import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
import java.util.Map;

-import javax.swing.text.html.parser.DTD;
-import javax.swing.text.html.parser.Parser;
+import org.htmlparser.Parser;
+import org.htmlparser.lexer.Lexer;
+import org.htmlparser.lexer.Page;
+import org.htmlparser.util.EncodingChangeException;
+import org.htmlparser.visitors.TextExtractingVisitor;

import ch.idok.common.errorhandling.DmsException;
import ch.idok.common.errorhandling.ErrorType;
@@ -53,7 +54,7 @@
/**
* @brief The content extractor class for html documents.
*/
- private final class Extractor extends Parser implements ContentExtractor {
+ private final class Extractor implements ContentExtractor {

/** @brief A reference to the raw document content. */
byte[] raw;
@@ -61,13 +62,8 @@
/** @brief A reference to the document meta data. */
Map<String, String> meta;

- /** @brief The string buffer that will hold indexable content. */
- private StringBuffer sb = new StringBuffer();
-
/** @brief Constructor. */
- public Extractor(byte[] rawContent, Map<String, String> metaData)
- throws IOException {
- super(DTD.getDTD("html"));
+ public Extractor(byte[] rawContent, Map<String, String> metaData) {
raw = rawContent;
meta = metaData;
}
@@ -80,15 +76,22 @@
/** @brief Retrieve the extracted indexable plain text. */
public byte[] getText() throws DmsException {
try {
- ByteArrayInputStream bais = new ByteArrayInputStream(raw);
- parse(new InputStreamReader(bais));
- byte[] result = sb.toString().getBytes();
- return result;
+ String charset="UTF-8";
+ Page page = new Page(new ByteArrayInputStream(raw), charset);
+ Parser parser = new Parser(new Lexer(page));
+ TextExtractingVisitor visitor = new TextExtractingVisitor();
+ String textInPage = null;
+ try {
+ parser.visitAllNodesWith(visitor);
+ textInPage = visitor.getExtractedText();
+ } catch (EncodingChangeException ex) {
+ parser.reset();
+ textInPage = visitor.getExtractedText();
+ }
+ return textInPage.getBytes();
} catch (Throwable th) {
DmsException.throwIt(ErrorType.READ_DOC, this,
"Failed to read document", "", th);
- } finally {
- sb = new StringBuffer();
}
return null;
}
@@ -103,16 +106,6 @@
raw = null;
meta = null;
}
-
- /**
- * @brief Add all html text elements to the searchable plain text.
- */
- @Override
- protected void handleText(char[] text) {
- sb.append(text);
- sb.append(" ");
- }
-
}

/** @brief Constructor. */
@@ -132,14 +125,7 @@
/** @brief Create a content extractor instance. */
public ContentExtractor getContentExtractor(byte[] rawContent,
Map<String, String> metaData) throws DmsException {
- try {
return new Extractor(rawContent, metaData);
- } catch (IOException ex) {
- DmsException.throwIt(ErrorType.EXTRACTOR_CONSTRUCTION, this,
- "Cannot handle this html document",
- "Construction of the TextHtmlExtractor failed", ex);
- }
- return null;
}

/**

Added: trunk/lib/htmlparser.jar
==============================================================================
Binary file. No diff available.

[idok-commit] idok commit r165 - in trunk: java/ch/idok/dmsd/impl/extractor/html lib, AFS account Stadler Hans Christian, 08/07/2008