Merge pull request #475 from sleuthkit/no_add_lang_2_bb

Delete TikaLanguageIdentifier class to improve ingest performance
This commit is contained in:
Richard Cordovano 2014-02-03 13:06:36 -08:00
commit a9879b7331
3 changed files with 4 additions and 84 deletions

View File

@ -1,7 +1,7 @@
/* /*
* Autopsy Forensic Browser * Autopsy Forensic Browser
* *
* Copyright 2012 Basis Technology Corp. * Copyright 2012-2013 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org * Contact: carrier <at> sleuthkit <dot> org
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -59,10 +59,8 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
"text/javascript" //"application/xml", "text/javascript" //"application/xml",
//"application/xml-dtd", //"application/xml-dtd",
); );
private final TikaLanguageIdentifier tikaLanguageIdentifier;
AbstractFileHtmlExtract() { AbstractFileHtmlExtract() {
tikaLanguageIdentifier = new TikaLanguageIdentifier();
this.module = KeywordSearchIngestModule.getDefault(); this.module = KeywordSearchIngestModule.getDefault();
ingester = Server.getIngester(); ingester = Server.getIngester();
} }
@ -166,11 +164,6 @@ import org.sleuthkit.datamodel.ReadContentInputStream;
totalRead = 0; totalRead = 0;
extracted = sb.toString(); extracted = sb.toString();
//attempt to identify language of extracted text and post it to the blackboard
tikaLanguageIdentifier.addLanguageToBlackBoard(extracted, sourceFile);
//converts BOM automatically to charSet encoding //converts BOM automatically to charSet encoding
byte[] encodedBytes = extracted.getBytes(outCharset); byte[] encodedBytes = extracted.getBytes(outCharset);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);

View File

@ -1,7 +1,7 @@
/* /*
* Autopsy Forensic Browser * Autopsy Forensic Browser
* *
* Copyright 2012 Basis Technology Corp. * Copyright 2012-2013 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org * Contact: carrier <at> sleuthkit <dot> org
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
@ -39,16 +39,11 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile;
import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.ReadContentInputStream; import org.sleuthkit.datamodel.ReadContentInputStream;
import org.apache.tika.Tika; import org.apache.tika.Tika;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseContext;
import org.sleuthkit.autopsy.casemodule.Case;
import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.coreutils.StringExtract;
import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
/** /**
* Extractor of text from TIKA supported AbstractFile content. Extracted text is * Extractor of text from TIKA supported AbstractFile content. Extracted text is
@ -75,11 +70,9 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
private int numChunks = 0; private int numChunks = 0;
//private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM
private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor();
private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<String>(); private final List<String> TIKA_SUPPORTED_TYPES = new ArrayList<>();
private final TikaLanguageIdentifier tikaLanguageIdentifier;
AbstractFileTikaTextExtract() { AbstractFileTikaTextExtract() {
tikaLanguageIdentifier = new TikaLanguageIdentifier();
this.module = KeywordSearchIngestModule.getDefault(); this.module = KeywordSearchIngestModule.getDefault();
ingester = Server.getIngester(); ingester = Server.getIngester();
@ -87,7 +80,7 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
for (MediaType mt : mediaTypes) { for (MediaType mt : mediaTypes) {
TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype()); TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype());
} }
logger.log(Level.INFO, "Tika supported media types: " + TIKA_SUPPORTED_TYPES); logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES);
} }
@ -138,13 +131,11 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
try { try {
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS); future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
} catch (TimeoutException te) { } catch (TimeoutException te) {
tika = null;
final String msg = "Exception: Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName(); final String msg = "Exception: Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName();
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te); KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
logger.log(Level.WARNING, msg); logger.log(Level.WARNING, msg);
throw new IngesterException(msg); throw new IngesterException(msg);
} catch (Exception ex) { } catch (Exception ex) {
tika = null;
final String msg = "Exception: Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName(); final String msg = "Exception: Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName();
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex); KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg); logger.log(Level.WARNING, msg);
@ -221,9 +212,6 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract {
extracted = sb.toString(); extracted = sb.toString();
//attempt to identify language of extracted text and post it to the blackboard
tikaLanguageIdentifier.addLanguageToBlackBoard(extracted, sourceFile);
//converts BOM automatically to charSet encoding //converts BOM automatically to charSet encoding
byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET); byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);

View File

@ -1,61 +0,0 @@
/*
* Autopsy Forensic Browser
*
* Copyright 2013 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.sleuthkit.datamodel.AbstractFile;
import org.sleuthkit.datamodel.BlackboardArtifact;
import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException;
/**
* TextLanguageIdentifier implementation based on a wrapped Tika
* LanguageIdentifier
*/
class TikaLanguageIdentifier implements TextLanguageIdentifier {

    private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
    /**
     * Length threshold for language detection: only text strictly longer than
     * this many characters is analyzed.
     */
    private static final int MIN_STRING_LENGTH = 1000;

    /**
     * Detects the language of the extracted text with Tika's
     * LanguageIdentifier and records it as a TSK_TEXT_LANGUAGE attribute on
     * the source file's general info artifact.
     *
     * Text of MIN_STRING_LENGTH characters or fewer is ignored. A
     * TskCoreException raised while fetching the artifact or adding the
     * attribute is logged at WARNING level and not propagated.
     *
     * @param extracted text extracted from the source file
     * @param sourceFile file the text was extracted from; receives the
     * language attribute
     */
    @Override
    public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
        // NOTE(review): short text is skipped, presumably because detection
        // on small samples is unreliable -- confirm the rationale.
        if (extracted.length() <= MIN_STRING_LENGTH) {
            return;
        }

        final org.apache.tika.language.LanguageIdentifier detector
                = new org.apache.tika.language.LanguageIdentifier(extracted);

        try {
            sourceFile.getGenInfoArtifact().addAttribute(
                    new BlackboardAttribute(
                            BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
                            KeywordSearchIngestModule.MODULE_NAME,
                            detector.getLanguage()));
        } catch (TskCoreException ex) {
            final String failure = "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName();
            logger.log(Level.WARNING, failure, ex);
        }
    }
}