From 6b91b4cd0fceee19d509c052f226d12db7af24e4 Mon Sep 17 00:00:00 2001 From: Richard Cordovano Date: Mon, 3 Feb 2014 16:03:08 -0500 Subject: [PATCH] Delete TikaLanguageIdentifier class to improve ingest performance --- .../AbstractFileHtmlExtract.java | 9 +-- .../AbstractFileTikaTextExtract.java | 18 +----- .../keywordsearch/TikaLanguageIdentifier.java | 61 ------------------- 3 files changed, 4 insertions(+), 84 deletions(-) delete mode 100755 KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java index e63545a904..49ca84aa8b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2012 Basis Technology Corp. + * Copyright 2012-2013 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -59,10 +59,8 @@ import org.sleuthkit.datamodel.ReadContentInputStream; "text/javascript" //"application/xml", //"application/xml-dtd", ); - private final TikaLanguageIdentifier tikaLanguageIdentifier; AbstractFileHtmlExtract() { - tikaLanguageIdentifier = new TikaLanguageIdentifier(); this.module = KeywordSearchIngestModule.getDefault(); ingester = Server.getIngester(); } @@ -166,11 +164,6 @@ import org.sleuthkit.datamodel.ReadContentInputStream; totalRead = 0; extracted = sb.toString(); - - //attempt to identify language of extracted text and post it to the blackboard - tikaLanguageIdentifier.addLanguageToBlackBoard(extracted, sourceFile); - - //converts BOM automatically to charSet encoding byte[] encodedBytes = extracted.getBytes(outCharset); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java index 2e802586c3..38c93631ca 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2012 Basis Technology Corp. + * Copyright 2012-2013 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -39,16 +39,11 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; import org.apache.tika.Tika; -import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; -import org.sleuthkit.datamodel.BlackboardArtifact; -import org.sleuthkit.datamodel.BlackboardAttribute; -import org.sleuthkit.datamodel.TskCoreException; /** * Extractor of text from TIKA supported AbstractFile content. Extracted text is @@ -75,11 +70,9 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { private int numChunks = 0; //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); - private final List TIKA_SUPPORTED_TYPES = new ArrayList(); - private final TikaLanguageIdentifier tikaLanguageIdentifier; + private final List TIKA_SUPPORTED_TYPES = new ArrayList<>(); AbstractFileTikaTextExtract() { - tikaLanguageIdentifier = new TikaLanguageIdentifier(); this.module = KeywordSearchIngestModule.getDefault(); ingester = Server.getIngester(); @@ -87,7 +80,7 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { for (MediaType mt : mediaTypes) { TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype()); } - logger.log(Level.INFO, "Tika supported media types: " + TIKA_SUPPORTED_TYPES); + logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); } @@ -138,13 +131,11 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { try { future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS); } catch (TimeoutException te) { - tika = null; final String msg = "Exception: Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName(); KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te); logger.log(Level.WARNING, msg); throw new IngesterException(msg); } catch (Exception ex) { - tika = null; final String msg = "Exception: Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName(); KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex); logger.log(Level.WARNING, msg); @@ -221,9 +212,6 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { extracted = sb.toString(); - //attempt to identify language of extracted text and post it to the blackboard - tikaLanguageIdentifier.addLanguageToBlackBoard(extracted, sourceFile); - //converts BOM automatically to charSet encoding byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java deleted file mode 100755 index 70c85f766b..0000000000 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2013 Basis Technology Corp. - * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.sleuthkit.autopsy.keywordsearch; - -import java.util.logging.Level; -import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.BlackboardArtifact; -import org.sleuthkit.datamodel.BlackboardAttribute; -import org.sleuthkit.datamodel.TskCoreException; - -/** - * TextLanguageIdentifier implementation based on a wrapped Tike - * LanguageIdentifier - */ -class TikaLanguageIdentifier implements TextLanguageIdentifier { - - private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName()); - private static final int MIN_STRING_LENGTH = 1000; - - @Override - public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) { - if (extracted.length() > MIN_STRING_LENGTH) { - org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted); - - //logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage() - // + " with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + " confidence"); - - BlackboardArtifact genInfo; - try { - genInfo = sourceFile.getGenInfoArtifact(); - - BlackboardAttribute textLang = new BlackboardAttribute( - BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(), - KeywordSearchIngestModule.MODULE_NAME, li.getLanguage()); - - genInfo.addAttribute(textLang); - - } catch (TskCoreException ex) { - logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex); - } - - } - } -} \ No newline at end of file