Skip to Content.
Sympa Menu

idok-commit - [idok-commit] idok commit r349 - trunk/java/ch/idok/dmsd/impl/extractor/microsoft

idok-commit AT lists.psi.ch

Subject: Commit emails of the iDok project

List archive

[idok-commit] idok commit r349 - trunk/java/ch/idok/dmsd/impl/extractor/microsoft


Chronological Thread 
  • From: "AFS account Stadler Hans Christian" <stadler_h AT savannah.psi.ch>
  • To: idok-commit AT lists.psi.ch
  • Subject: [idok-commit] idok commit r349 - trunk/java/ch/idok/dmsd/impl/extractor/microsoft
  • Date: Mon, 30 Mar 2009 15:02:23 +0200
  • List-archive: <https://lists.web.psi.ch/pipermail/idok-commit/>
  • List-id: Commit emails of the iDok project <idok-commit.lists.psi.ch>

Author: stadler_h
Date: Mon Mar 30 15:02:23 2009
New Revision: 349

Log:
Retry on OO failure, better logging

Modified:
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSExtractorProcess.java

trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSOfficeExtractorFactory.java
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/OOProcess.java
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ProcessTimeout.java
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ThreadTimeout.java

Modified:
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSExtractorProcess.java
==============================================================================
--- trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSExtractorProcess.java
(original)
+++ trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSExtractorProcess.java
Mon Mar 30 15:02:23 2009
@@ -196,7 +196,7 @@
ErrorType.DOC_HANDLING,
this,
"Cannot handle document",
- "Error during extracting text from cells",
+ "Error during extracting text from presentation",
th);
}
}
@@ -236,9 +236,9 @@
logger.finest("Printed document text to stdout, exiting with
status 0");
retval = 0;
} catch (DmsException ex) {
- logger.warning(Util.mergeMessages(ex.getUserMessage(),
ex.getDetailedMessage()+"\n"+ex));
+ logger.warning(Util.mergeMessages(ex.getUserMessage(),
ex.getDetailedMessage()+"\n"+Util.stackTraceToString(ex)));
} catch (Throwable th) {
- logger.warning("Failed to extract and print document text,
exiting with status "+retval+":\n"+th);
+ logger.warning("Failed to extract and print document text,
exiting with status "+retval+":\n"+Util.stackTraceToString(th));
}
System.exit(retval);
}

Modified:
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSOfficeExtractorFactory.java
==============================================================================
---
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSOfficeExtractorFactory.java
(original)
+++
trunk/java/ch/idok/dmsd/impl/extractor/microsoft/MSOfficeExtractorFactory.java
Mon Mar 30 15:02:23 2009
@@ -73,36 +73,52 @@
*
* OpenOffice seems to leak file descriptors.
*/
- int ooRestartCounter;
+ long ooRestartCounter;

/**
* @brief Restart threshold for OpenOffice.
*
* After that many extractions, OpenOffice will be restarted.
+ *
+ * Default 40. Can be set explicitly using the
+ * ch.idok.dmsd.impl.extractor.microsoft.ooRestartThreshold
+ * property.
*/
- private static final int ooRestartThreshold = 40;
+ static long ooRestartThreshold = 40;

/**
* @brief Maximum time for finishing text extraction in milliseconds
*
- * Default is 10s, can be changed using the
- * ch.idok.dmsd.impl.extractor.microsoft.interruptdelay
+ * Default 30s. Can be set explicitly using the
+ * ch.idok.dmsd.impl.extractor.microsoft.interruptDelay
* property.
*/
- static long interruptDelay = 5000;
+ static long interruptDelay = 30000;

/**
* @brief Maximum delay in ms for process shutdown
+ *
+ * Default 10s. Can be set explicitly using the
+ * ch.idok.dmsd.impl.extractor.microsoft.processKillTimeout
+ * property.
*/
static long processKillTimeout = 10000;

/**
- * @brief Maximum delay in ms for process startup
+ * @brief Maximum delay in ms for process startup
+ *
+ * Default 10s. Can be set explicitly using the
+ * ch.idok.dmsd.impl.extractor.microsoft.processStartupTimeout
+ * property.
*/
- private long processStartupTimeout = 5000;
+ private long processStartupTimeout = 10000;

/**
* @brief Timeout for idle input streams
+ *
+ * Default 20s. Can be set explicitly using the
+ * ch.idok.dmsd.impl.extractor.microsoft.streamReaderTimeout
+ * property.
*/
long streamReaderTimeout = 20000;

@@ -236,33 +252,78 @@
} while (true);
}

- /** @brief Retrieve the searchable plain text. */
+ /**
+ * @brief Retrieve the searchable plain text.
+ *
+ * Will attempt to do one retry if it fails.
+ */
public byte[] getText() throws DmsException {
DmsException exception = null;
+ byte[] res = null;
+ File tf = null;
+ FileOutputStream fos = null;
+
+ try {
+ // Save the document in a temporary file
+ tf = File.createTempFile("Indexer", getDesc().fileExtension);
+ fos = new FileOutputStream(tf);
+ fos.write(raw);
+ fos.close();
+ fos = null;
+
+ boolean retry = false;
+ do {
+ try {
+ res = tryGetText(tf.getCanonicalPath());
+ return res;
+ } catch (DmsException ex) {
+ if (retry)
+ throw ex;
+ logger.warning("Exception while extracting text\n" +
Util.stackTraceToString(ex)
+ + "\nRetrying...");
+ System.gc();
+ retry = true;
+ }
+ } while (retry);
+ DmsException.throwIt(ErrorType.INTERNAL, this, "Bug
detected", "No exception after second retry");
+ } catch (DmsException ex) {
+ exception = ex;
+ } catch (Throwable th) {
+ exception = new DmsException(ErrorType.DOC_HANDLING, this,
+ "Failed to handle document",
Util.stackTraceToString(th));
+ } finally {
+ if (fos != null)
+ try {
+ fos.close();
+ } catch (Throwable th) {
+ } // Ignore IOException
+ if (tf != null)
+ try {
+ tf.delete();
+ } catch (Throwable th) {
+ } // Ignore SecurityException
+
+ }
+ dispose();
+ throw exception;
+ }
+
+ /** @brief Try to retrieve the searchable plain text. */
+ private byte[] tryGetText(String doc) throws DmsException {
+ DmsException exception = null;
if ((soffice == null) || (++ooRestartCounter >=
ooRestartThreshold)) {
logger.finer("Restarting OpenOffice");
initOO();
}
- File tf = null;
- FileOutputStream fos = null;
InputStream is = null;
Timer timer = null;
Thread errThread = null;
Process proc = null;
+ ProcessTimeout pto = null;

try {
- // Save the document in a temporary file
- tf = File.createTempFile("Indexer", "." +
getDesc().fileExtension);
- fos = new FileOutputStream(tf);
- fos.write(raw);
- fos.close();
- fos = null;
- // Start the timer that kills the extraction process after 3
seconds
- // if the extraction fails to produce a result and start the
extraction process
- timer = new Timer("InterruptTimer for thread
"+Thread.currentThread().getName());
+ // Start the extraction process
String cls = MSExtractorProcess.class.getCanonicalName();
- //
"ch.idok.dmsd.impl.extractor.microsoft.MSExtractorProcess";
- String doc = tf.getCanonicalPath();
String tp = Integer.toString(type);
String level = Setup.getSetup().getLogLevel().toString();
ProcessBuilder procBuilder = new ProcessBuilder(new
ProcessArgs(logger, cls, doc,tp, level));
@@ -274,8 +335,11 @@
InputReader errOut = new InputReader(proc.getErrorStream());
errThread = new Thread(errOut, "StdErrReader-getText");
errThread.start();
- // Start the timeout thread
- timer.schedule(new ProcessTimeout(proc, logger),
interruptDelay);
+ // Start the timeout thread that kills the extraction
process after interruptDelay
+ // if the extraction fails to produce a result
+ timer = new Timer("InterruptTimer for thread
"+Thread.currentThread().getName());
+ pto = new ProcessTimeout(proc, logger);
+ timer.schedule(pto, interruptDelay);
// Read the stdout output of the extraction process
is = proc.getInputStream();
int length = readLength(is)+1; // allow for 1 character of
slack
@@ -311,7 +375,7 @@
if (errOut.throwable != null)
throw errOut.throwable;
} catch (Throwable th) {
- logger.warning("Failed to retrieve error stream of MS
Office Extractor process\n" + th);
+ logger.warning("Failed to retrieve error stream of MS
Office Extractor process\n" + Util.stackTraceToString(th));
}
if (rval != 0)
DmsException.throwIt(ErrorType.DOC_HANDLING, this,
"Cannot handle document.", "MS Office Extractor Process terminated with
status "+rval);
@@ -320,7 +384,7 @@
exception = ex;
} catch (Throwable th) {
exception = new DmsException(ErrorType.DOC_HANDLING, this,
- "Failed to handle document",
Util.stackTraceToString(th), th);
+ "Failed to handle document", "Text extraction
failed", th);
} finally {
if (timer != null)
timer.cancel();
@@ -331,24 +395,14 @@
errThread.join(100);
} catch (Throwable th) {
} // Ignore InterruptedException
- if (fos != null)
- try {
- fos.close();
- } catch (Throwable th) {
- } // Ignore IOException
- if (tf != null)
- try {
- tf.delete();
- } catch (Throwable th) {
- } // Ignore SecurityException
if (is != null)
try {
is.close();
} catch (Throwable th) {
} // Ignore IOException
- if (Thread.interrupted()) {
- logger.finest("Thread
"+Thread.currentThread().getName()+" was interrupted");
- dispose();
+ if (pto != null && pto.fired) {
+ exception = new DmsException(ErrorType.DOC_HANDLING,
this, "Failed to handle document",
+ "Extractor process was interrupted due to
timeout", exception);
}
}
try { killOO(); } catch (Throwable th) {}
@@ -408,7 +462,7 @@
}
logger.finest("OO control process terminated with exit value " +
soffice.exitValue());
} catch (Throwable th) {
- logger.severe("Internal bug detected\n" + th);
+ logger.severe("Internal bug detected\n" +
Util.stackTraceToString(th));
} finally {
if (timer != null)
timer.cancel();
@@ -470,7 +524,7 @@
if (soffice != null)
soffice.destroy();
soffice = null;
- }
+ }
}

/**
@@ -494,6 +548,7 @@
try {
logger = config.getLogger("dmsd.impl.extractor.microsoft");
logger.finest("Initializing extractor factory for MS Office
Documents.");
+ ooRestartThreshold =
getLong("ch.idok.dmsd.impl.extractor.microsoft.ooRestartThreshold",
ooRestartThreshold);
interruptDelay =
getLong("ch.idok.dmsd.impl.extractor.microsoft.interruptDelay",
interruptDelay);
processKillTimeout =
getLong("ch.idok.dmsd.impl.extractor.microsoft.processKillTimeout",
processKillTimeout);
processStartupTimeout =
getLong("ch.idok.dmsd.impl.extractor.microsoft.processStartupTimeout",
processStartupTimeout);
@@ -517,7 +572,7 @@
} catch (DmsException ex) {
throw ex;
} catch (Throwable th) {
- DmsException.throwIt(ErrorType.INTERNAL, this, "Bug detected",
Util.stackTraceToString(th), th);
+ DmsException.throwIt(ErrorType.INTERNAL, this, "Bug detected",
Util.stackTraceToString(th));
}
}


Modified: trunk/java/ch/idok/dmsd/impl/extractor/microsoft/OOProcess.java
==============================================================================
--- trunk/java/ch/idok/dmsd/impl/extractor/microsoft/OOProcess.java
(original)
+++ trunk/java/ch/idok/dmsd/impl/extractor/microsoft/OOProcess.java Mon
Mar 30 15:02:23 2009
@@ -180,7 +180,7 @@
} while(true);
retval = 10;
} catch (IOException ex) {
- logger.warning("Exception while processing Indexer
commands\n"+ex);
+ logger.warning("Exception while processing Indexer
commands\n"+Util.stackTraceToString(ex));
}
ooproc.stop();
retval = 0;
@@ -270,7 +270,7 @@
timer.cancel();
Thread.sleep(desktopTerminationDelay);
} catch (Throwable th) {
- logger.finer("Can't terminate OO desktop\n"+th);
+ logger.finer("Can't terminate OO
desktop\n"+Util.stackTraceToString(th));
}finally {
desktop = null;
timer.cancel();
@@ -339,7 +339,7 @@
}
timer.cancel();
} catch (Throwable th) {
- logger.warning("Failed to establish initial OO connection\n"
+ th);
+ logger.warning("Failed to establish initial OO connection\n"
+ Util.stackTraceToString(th));
}

if (unoObj == null) {
@@ -360,7 +360,8 @@
int retval = soffice.exitValue();
logger.warning("OpenOffice exited with return value " +
retval + "\n" + connectEx + "\n" + sb);
} catch (Throwable th) {
- logger.severe("Unexpected exception while killing OO " +
th + "\nOO output\n" + sb);
+ logger.severe("Unexpected exception while killing OO\n"
+ Util.stackTraceToString(th)
+ + "\nOO output\n" + sb);
} finally {
soffice = null;
sb = null;

Modified: trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ProcessTimeout.java
==============================================================================
--- trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ProcessTimeout.java
(original)
+++ trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ProcessTimeout.java
Mon Mar 30 15:02:23 2009
@@ -22,17 +22,36 @@
import java.util.TimerTask;
import java.util.logging.Logger;

+/**
+ * @brief Timer task for killing a process
+ */
class ProcessTimeout extends TimerTask {
+ /** @brief The process to kill */
private Process process;
+ /** @brief Logger object */
private Logger logger;
+ /** @brief Has the timer task fired? */
+ public volatile boolean fired;
+
+ /**
+ * @brief Constructor
+ *
+ * @param proc Process to kill
+ * @param log Logger object
+ */
ProcessTimeout(Process proc, Logger log) {
+ fired = false;
process = proc;
logger = log;
}

+ /**
+ * @brief Kill the process
+ */
@Override
public void run() {
process.destroy();
- logger.fine("Killed process " + process + " due to lack of
progress");
+ fired = true;
+ logger.warning("Killed process " + process + " due to lack of
progress");
}
}

Modified: trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ThreadTimeout.java
==============================================================================
--- trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ThreadTimeout.java
(original)
+++ trunk/java/ch/idok/dmsd/impl/extractor/microsoft/ThreadTimeout.java Mon
Mar 30 15:02:23 2009
@@ -26,16 +26,32 @@
* @brief Timer task for interrupting a thread
*/
class ThreadTimeout extends TimerTask {
+ /** @brief Thread to interrupt */
private Thread snail;
+ /** @brief Logger object */
private Logger logger;
+ /** @brief Did this timer task fire? */
+ public volatile boolean fired;
+
+ /**
+ * @brief Constructor
+ *
+ * @param thread Thread to interrupt
+ * @param log Logger object
+ */
ThreadTimeout(Thread thread, Logger log) {
+ fired = false;
snail = thread;
- logger = log;
+ logger = log;
}

+ /**
+ * @brief Interrupt the thread
+ */
@Override
public void run() {
snail.interrupt();
- logger.fine("Interrupted thread " + snail + "due to lack of
progress");
+ fired = true;
+ logger.warning("Interrupted thread " + snail + "due to lack of
progress");
}
}



  • [idok-commit] idok commit r349 - trunk/java/ch/idok/dmsd/impl/extractor/microsoft, AFS account Stadler Hans Christian, 03/30/2009

Archive powered by MHonArc 2.6.19.

Top of Page