vik-174: added min string length to language detection

This commit is contained in:
jmillman 2013-09-13 17:12:50 -04:00
parent 47ce10ad90
commit 6b7df1ae47
2 changed files with 52 additions and 45 deletions

View File

@ -13,12 +13,13 @@ import org.sleuthkit.datamodel.AbstractFile;
public interface TextLanguageIdentifier { public interface TextLanguageIdentifier {
/** /**
* attempts to identify the language of the given String and add it to the black board for the given {@code AbstractFile} * attempts to identify the language of the given String and add it to the
* as a TSK_TEXT_LANGUAGE attribute on a TSK_GEN_INFO artifact. * black board for the given {@code AbstractFile} as a TSK_TEXT_LANGUAGE
* attribute on a TSK_GEN_INFO artifact.
* *
* @param extracted the String whose language is to be identified * @param extracted the String whose language is to be identified
* @param sourceFile the AbstractFile the string is extracted from. * @param sourceFile the AbstractFile the string is extracted from.
* @return * @return
*/ */
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile); public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile);
} }

View File

@ -12,7 +12,8 @@ import org.sleuthkit.datamodel.BlackboardAttribute;
import org.sleuthkit.datamodel.TskCoreException; import org.sleuthkit.datamodel.TskCoreException;
/** /**
*TextLanguageIdentifier implementation based on a wrapped Tike LanguageIdentifier * TextLanguageIdentifier implementation based on a wrapped Tike
* LanguageIdentifier
* *
* *
* @author jmillman * @author jmillman
@ -20,51 +21,56 @@ import org.sleuthkit.datamodel.TskCoreException;
public class TikaLanguageIdentifier implements TextLanguageIdentifier { public class TikaLanguageIdentifier implements TextLanguageIdentifier {
private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName()); private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName());
private static final int MIN_STRING_LENGTH = 1000;
@Override @Override
public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) { public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) {
if (extracted.length() > MIN_STRING_LENGTH) {
org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted);
org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted); logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage()
+ " with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + " confidence");
logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage() BlackboardArtifact genInfo;
+ "with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + "confidence"); try {
genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO);
BlackboardArtifact genInfo; BlackboardAttribute textLang = new BlackboardAttribute(
try { BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
genInfo = sourceFile.newArtifact(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO); KeywordSearchIngestModule.MODULE_NAME, li.getLanguage());
BlackboardAttribute textLang = new BlackboardAttribute(
BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(),
KeywordSearchIngestModule.MODULE_NAME, li.getLanguage());
genInfo.addAttribute(textLang); genInfo.addAttribute(textLang);
} catch (TskCoreException ex) { } catch (TskCoreException ex) {
logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex); logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex);
}
}else{
logger.info("extracted text too short, skipping language detection on " + sourceFile.getName());
} }
} /* //attempt to verify that artifact with attribute was created
ArrayList<BlackboardArtifact> arts;
try {
arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId());
/* //attempt to verify that artifact with attribute was created for (BlackboardArtifact art : arts) {
ArrayList<BlackboardArtifact> arts;
List<BlackboardAttribute> attrs = art.getAttributes();
for (BlackboardAttribute attr : attrs) {
if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) {
logger.log(Level.INFO, "succesfully added " + attr.getValueString() + " to gen info for:" + sourceFile.getName());
break;
}
}
}
} catch (TskCoreException ex) {
Exceptions.printStackTrace(ex);
}*/
try {
arts = Case.getCurrentCase().getSleuthkitCase().getBlackboardArtifacts(BlackboardArtifact.ARTIFACT_TYPE.TSK_GEN_INFO, sourceFile.getId());
for (BlackboardArtifact art : arts) {
List<BlackboardAttribute> attrs = art.getAttributes();
for (BlackboardAttribute attr : attrs) {
if (attr.getAttributeTypeID() == BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID()) {
logger.log(Level.INFO, "succesfully added " + attr.getValueString() + " to gen info for:" + sourceFile.getName());
break;
}
}
}
} catch (TskCoreException ex) {
Exceptions.printStackTrace(ex);
}*/
}
} }