diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java index e63545a904..49ca84aa8b 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileHtmlExtract.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2012 Basis Technology Corp. + * Copyright 2012-2013 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -59,10 +59,8 @@ import org.sleuthkit.datamodel.ReadContentInputStream; "text/javascript" //"application/xml", //"application/xml-dtd", ); - private final TikaLanguageIdentifier tikaLanguageIdentifier; AbstractFileHtmlExtract() { - tikaLanguageIdentifier = new TikaLanguageIdentifier(); this.module = KeywordSearchIngestModule.getDefault(); ingester = Server.getIngester(); } @@ -166,11 +164,6 @@ import org.sleuthkit.datamodel.ReadContentInputStream; totalRead = 0; extracted = sb.toString(); - - //attempt to identify language of extracted text and post it to the blackboard - tikaLanguageIdentifier.addLanguageToBlackBoard(extracted, sourceFile); - - //converts BOM automatically to charSet encoding byte[] encodedBytes = extracted.getBytes(outCharset); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java index 2e802586c3..38c93631ca 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/AbstractFileTikaTextExtract.java @@ -1,7 +1,7 @@ /* * Autopsy Forensic Browser * - * Copyright 2012 Basis Technology Corp. + * Copyright 2012-2013 Basis Technology Corp. * Contact: carrier sleuthkit org * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -39,16 +39,11 @@ import org.sleuthkit.autopsy.ingest.IngestModuleAbstractFile; import org.sleuthkit.datamodel.AbstractFile; import org.sleuthkit.datamodel.ReadContentInputStream; import org.apache.tika.Tika; -import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.sleuthkit.autopsy.casemodule.Case; import org.sleuthkit.autopsy.coreutils.StringExtract; import org.sleuthkit.autopsy.keywordsearch.Ingester.IngesterException; -import org.sleuthkit.datamodel.BlackboardArtifact; -import org.sleuthkit.datamodel.BlackboardAttribute; -import org.sleuthkit.datamodel.TskCoreException; /** * Extractor of text from TIKA supported AbstractFile content. Extracted text is @@ -75,11 +70,9 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { private int numChunks = 0; //private static final String UTF16BOM = "\uFEFF"; disabled prepending of BOM private final ExecutorService tikaParseExecutor = Executors.newSingleThreadExecutor(); - private final List TIKA_SUPPORTED_TYPES = new ArrayList(); - private final TikaLanguageIdentifier tikaLanguageIdentifier; + private final List TIKA_SUPPORTED_TYPES = new ArrayList<>(); AbstractFileTikaTextExtract() { - tikaLanguageIdentifier = new TikaLanguageIdentifier(); this.module = KeywordSearchIngestModule.getDefault(); ingester = Server.getIngester(); @@ -87,7 +80,7 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { for (MediaType mt : mediaTypes) { TIKA_SUPPORTED_TYPES.add(mt.getType() + "/" + mt.getSubtype()); } - logger.log(Level.INFO, "Tika supported media types: " + TIKA_SUPPORTED_TYPES); + logger.log(Level.INFO, "Tika supported media types: {0}", TIKA_SUPPORTED_TYPES); } @@ -138,13 +131,11 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { try { future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS); } catch (TimeoutException te) { - tika = null; final String msg = "Exception: Tika parse timeout for content: " + sourceFile.getId() + ", " + sourceFile.getName(); KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te); logger.log(Level.WARNING, msg); throw new IngesterException(msg); } catch (Exception ex) { - tika = null; final String msg = "Exception: Unexpected exception from Tika parse task execution for file: " + sourceFile.getId() + ", " + sourceFile.getName(); KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex); logger.log(Level.WARNING, msg); @@ -221,9 +212,6 @@ class AbstractFileTikaTextExtract implements AbstractFileExtract { extracted = sb.toString(); - //attempt to identify language of extracted text and post it to the blackboard - tikaLanguageIdentifier.addLanguageToBlackBoard(extracted, sourceFile); - //converts BOM automatically to charSet encoding byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET); AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchListsAbstract.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchListsAbstract.java index 34cbea8536..c6bb580bf0 100644 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchListsAbstract.java +++ b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/KeywordSearchListsAbstract.java @@ -91,7 +91,8 @@ public abstract class KeywordSearchListsAbstract { ips.add(new Keyword("(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])", false, BlackboardAttribute.ATTRIBUTE_TYPE.TSK_IP_ADDRESS)); //email List emails = new ArrayList(); - emails.add(new Keyword("[A-Z0-9._%-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}", false, BlackboardAttribute.ATTRIBUTE_TYPE.TSK_EMAIL)); + emails.add(new Keyword("(?=.{8})[a-z0-9%+_-]+(?:\\.[a-z0-9%+_-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z]{2,4}(? urls = new ArrayList(); //urls.add(new Keyword("http://|https://|^www\\.", false, BlackboardAttribute.ATTRIBUTE_TYPE.TSK_URL)); diff --git a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java b/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java deleted file mode 100755 index 70c85f766b..0000000000 --- a/KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/TikaLanguageIdentifier.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Autopsy Forensic Browser - * - * Copyright 2013 Basis Technology Corp. - * Contact: carrier sleuthkit org - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.sleuthkit.autopsy.keywordsearch; - -import java.util.logging.Level; -import org.sleuthkit.autopsy.coreutils.Logger; -import org.sleuthkit.datamodel.AbstractFile; -import org.sleuthkit.datamodel.BlackboardArtifact; -import org.sleuthkit.datamodel.BlackboardAttribute; -import org.sleuthkit.datamodel.TskCoreException; - -/** - * TextLanguageIdentifier implementation based on a wrapped Tike - * LanguageIdentifier - */ -class TikaLanguageIdentifier implements TextLanguageIdentifier { - - private static final Logger logger = Logger.getLogger(TikaLanguageIdentifier.class.getName()); - private static final int MIN_STRING_LENGTH = 1000; - - @Override - public void addLanguageToBlackBoard(String extracted, AbstractFile sourceFile) { - if (extracted.length() > MIN_STRING_LENGTH) { - org.apache.tika.language.LanguageIdentifier li = new org.apache.tika.language.LanguageIdentifier(extracted); - - //logger.log(Level.INFO, sourceFile.getName() + " detected language: " + li.getLanguage() - // + " with " + ((li.isReasonablyCertain()) ? "HIGH" : "LOW") + " confidence"); - - BlackboardArtifact genInfo; - try { - genInfo = sourceFile.getGenInfoArtifact(); - - BlackboardAttribute textLang = new BlackboardAttribute( - BlackboardAttribute.ATTRIBUTE_TYPE.TSK_TEXT_LANGUAGE.getTypeID(), - KeywordSearchIngestModule.MODULE_NAME, li.getLanguage()); - - genInfo.addAttribute(textLang); - - } catch (TskCoreException ex) { - logger.log(Level.WARNING, "failed to add TSK_TEXT_LANGUAGE attribute to TSK_GEN_INFO artifact for file: " + sourceFile.getName(), ex); - } - - } - } -} \ No newline at end of file diff --git a/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java b/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java index b7c798e4ea..aa318570ca 100644 --- a/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java +++ b/Testing/test/qa-functional/src/org/sleuthkit/autopsy/testing/RegressionTest.java @@ -182,7 +182,7 @@ public class RegressionTest extends TestCase { public void testConfigureHash() { logger.info("Hash Configure"); - JDialog hashMainDialog = JDialogOperator.waitJDialog("Hash Database Configuration", false, false); + JDialog hashMainDialog = JDialogOperator.waitJDialog("Hash Set Configuration", false, false); JDialogOperator hashMainDialogOperator = new JDialogOperator(hashMainDialog); List databases = new ArrayList(); databases.add(System.getProperty("nsrl_path")); @@ -190,7 +190,7 @@ public class RegressionTest extends TestCase { for (String database : databases) { JButtonOperator importButtonOperator = new JButtonOperator(hashMainDialogOperator, "Import"); importButtonOperator.pushNoBlock(); - JDialog addDatabaseDialog = JDialogOperator.waitJDialog("Add Hash Database", false, false); + JDialog addDatabaseDialog = JDialogOperator.waitJDialog("Import Hash Database", false, false); JDialogOperator addDatabaseDialogOperator = new JDialogOperator(addDatabaseDialog); JButtonOperator browseButtonOperator = new JButtonOperator(addDatabaseDialogOperator, "Browse", 0); browseButtonOperator.pushNoBlock(); @@ -339,4 +339,4 @@ public class RegressionTest extends TestCase { KeywordSearchListsXML curr = KeywordSearchListsXML.getCurrent(); curr.setUseForIngest("URLs", true); } -} \ No newline at end of file +}