diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java index ade1be040a..43e3981232 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TextLanguageIdentifier.java @@ -13,12 +13,13 @@ import org.sleuthkit.datamodel.AbstractFile; public interface TextLanguageIdentifier { /** - * attempts to identify the language of the given String and add it to the black board for the given {@code AbstractFile} - * as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact. + * attempts to identify the language of the given String and add it to the + * black board for the given {@code AbstractFile} as a TSK_TEXT_LANGUAGE + * attribute on a TSK_GEN_INFO artifact. * - * @param extracted the String whose language is to be identified + * @param extracted the String whose language is to be identified * @param sourceFile the AbstractFile the string is extracted from. 
* @return */ - public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile); + public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile); } diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java index 011da893c0..5fe6d3415c 100755 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java @@ -12,59 +12,65 @@ import org.sleuthkit.datamodel.BlackboardAttribute; import org.sleuthkit.datamodel.TskCoreException; /** - *TextLanguageIdentifier implementation based on a wrapped Tike LanguageIdentifier - * - * + * TextLanguageIdentifier implementation based on a wrapped Tika + * LanguageIdentifier + * + * * @author jmillman */ public class TikaLanguageIdentifier implements TextLanguageIdentifier { private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName()); + private static final int MIN_STRING_LENGTH = 1000; @Override public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) { + if (extracted.length() > MIN_STRING_LENGTH) { + org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted); - org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted); - - logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage() - + "with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + "confidence"); - - BlackboardArtifact genInfo; - try { - genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO); + logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage() + + " with " + ((li.isReasonablyCertain()) ? 
"HIGH" : "LOW") + " confidence"); - BlackboardAttribute textLang = new BlackboardAttribute( - BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(), - KeywordSearchIngestModule.MODULE_NAME, li.getLanguage()); + BlackboardArtifact genInfo; + try { + genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO); + + BlackboardAttribute textLang = new BlackboardAttribute( + BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(), + KeywordSearchIngestModule.MODULE_NAME, li.getLanguage()); - genInfo.addAttribute(textLang); + genInfo.addAttribute(textLang); - } catch (TskCoreException ex) { - logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex); + } catch (TskCoreException ex) { + logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex); + } + + + + + }else{ + logger.info("extracted text too short, skipping language detection on " + sourceFile.getName()); } + } /* //attempt to verify that artifact with attribute was created + ArrayList arts; + + + try { + arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId()); + + for (BlackboardArtifact art : arts) { + + List attrs = art.getAttributes(); + for (BlackboardAttribute attr : attrs) { + if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) { + logger.log(Level.INFO, "succesfully added " + attr.getValueString() + " to gen info for:" + sourceFile.getName()); + break; + } + } + } + } catch (TskCoreException ex) { + Exceptions.printStackTrace(ex); + }*/ - - - /* //attempt to verify that artifact with attribute was created - ArrayList arts; - - - try { - arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId()); - - for 
(BlackboardArtifact art : arts) { - - List attrs = art.getAttributes(); - for (BlackboardAttribute attr : attrs) { - if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) { - logger.log(Level.INFO, "succesfully added " + attr.getValueString() + " to gen info for:" + sourceFile.getName()); - break; - } - } - } - } catch (TskCoreException ex) { - Exceptions.printStackTrace(ex); - }*/ - } -} +} \ No newline at end of file