From 48cdcdd602fdd95684cccb035abdc647f2c67c41 Mon Sep 17 00:00:00 2001
From: Greg DiCristofaro <gregd@basistech.com>
Date: Tue, 5 Jan 2021 10:18:59 -0500
Subject: [PATCH] working on translation dictionary implementation

---
 .../localization_scripts/csvutil.py           | 12 +++++++-
 .../localization_scripts/diffscript.py        | 16 +++++++++--
 .../localization_scripts/envutil.py           |  1 +
 .../localization_scripts/fileutil.py          | 12 ++------
 .../localization_scripts/gitutil.py           | 23 +++++++++++----
 .../localization_scripts/itemchange.py        |  2 ++
 .../localization_scripts/langpropsutil.py     |  2 ++
 .../localization_scripts/languagedictutil.py  | 28 +++++++++++++++----
 .../localization_scripts/outputresult.py      |  1 +
 .../localization_scripts/propentry.py         |  1 +
 .../localization_scripts/propsutil.py         |  5 +++-
 .../localization_scripts/tabularutil.py       |  8 +++++-
 .../localization_scripts/updatepropsscript.py |  4 +++
 13 files changed, 89 insertions(+), 26 deletions(-)

diff --git a/release_scripts/localization_scripts/csvutil.py b/release_scripts/localization_scripts/csvutil.py
index e34dc8ea17..21e98cac52 100644
--- a/release_scripts/localization_scripts/csvutil.py
+++ b/release_scripts/localization_scripts/csvutil.py
@@ -4,7 +4,7 @@ from typing import List, Iterable, Tuple
 import csv
 import os
 
-from fileutil import OMITTED_ADDITION, get_filename_addition, DELETED_ADDITION, FOUND_ADDITION
+from fileutil import get_filename_addition
 from outputresult import OutputResult
 
 
@@ -54,6 +54,16 @@ def csv_to_records(input_path: str, header_row: bool) -> Tuple[List[List[str]],
         return results, header
 
 
+# For use with creating csv filenames for entries that have been omitted.
+OMITTED_ADDITION = '-omitted'
+
+# For use with creating csv filenames for entries that have been deleted.
+DELETED_ADDITION = '-deleted'
+
+# For use with creating csv filenames for entries where a translation was found.
+FOUND_ADDITION = '-found'
+
+
 def write_results_to_csv(results: OutputResult, output_path: str):
     """
     Writes the result of processing to the output path as a csv file.  If omitted values are present, for output_path of
diff --git a/release_scripts/localization_scripts/diffscript.py b/release_scripts/localization_scripts/diffscript.py
index c1b04af152..3f08747f6e 100644
--- a/release_scripts/localization_scripts/diffscript.py
+++ b/release_scripts/localization_scripts/diffscript.py
@@ -5,12 +5,14 @@ As a consequence, it also requires git >= 1.7.0 and python >= 3.4.
 import sys
 from envutil import get_proj_dir
 from excelutil import write_results_to_xlsx
-from gitutil import get_property_files_diff, get_git_root, get_commit_id
+from gitutil import get_property_files_diff, get_git_root, get_commit_id, get_tree, list_paths
 from itemchange import convert_to_output
 from csvutil import write_results_to_csv
 import argparse
 from langpropsutil import get_commit_for_language, LANG_FILENAME
 from outputtype import OutputType
+from languagedictutil import extract_translations
+from propsutil import get_lang_bundle_name, DEFAULT_PROPS_FILENAME
 
 
 def main():
@@ -41,7 +43,7 @@ def main():
                         help='Specify the path to the properties file containing key value pairs of language mapped to '
                              'the commit of when bundles for that language were most recently updated.')
 
-    parser.add_argument('-t', '--translation-dict', dest='translation_dict', type=bool, required=False, default=False,
+    parser.add_argument('-td', '--translation-dict', dest='translation_dict', action='store_true', default=False,
                         help='If this flag is specified, a dictionary mapping original prop key values to translated '
                              'values.  If this flag is specified, it will ')
 
@@ -55,7 +57,7 @@ def main():
     output_type = args.output_type
     show_translated_col = not args.no_translated_col
     language_updates_file = args.language_file
-
+    use_translation_dict = args.translation_dict
     lang = args.language
     if lang is not None:
         commit_1_id = get_commit_for_language(lang, language_updates_file)
@@ -66,6 +68,13 @@ def main():
         parser.print_help(sys.stderr)
         sys.exit(1)
 
+    translation_dict = None
+    if use_translation_dict and lang:
+        translation_dict = extract_translations(
+            file_iter=list_paths(get_tree(repo_path, commit_1_id)),
+            orig_filename=DEFAULT_PROPS_FILENAME,
+            translated_filename=get_lang_bundle_name(lang))
+
     commit_2_id = args.commit_2_id
     show_commits = not args.no_commits
 
@@ -73,6 +82,7 @@ def main():
     processing_result = convert_to_output(changes,
                                           commit1_id=get_commit_id(repo_path, commit_1_id) if show_commits else None,
                                           commit2_id=get_commit_id(repo_path, commit_2_id) if show_commits else None,
+                                          translation_dict=translation_dict,
                                           show_translated_col=show_translated_col,
                                           separate_deleted=True)
 
diff --git a/release_scripts/localization_scripts/envutil.py b/release_scripts/localization_scripts/envutil.py
index cec2a00eda..5275894b91 100644
--- a/release_scripts/localization_scripts/envutil.py
+++ b/release_scripts/localization_scripts/envutil.py
@@ -8,6 +8,7 @@ from typing import Union
 def get_proj_dir(path: Union[pathlib.PurePath, str] = __file__) -> str:
     """
     Gets parent directory of this file (and subsequently, the project).
+
     Args:
         path: Can be overridden to provide a different file.  This will return the parent of that file in that instance.
 
diff --git a/release_scripts/localization_scripts/fileutil.py b/release_scripts/localization_scripts/fileutil.py
index 561420220e..e31b554c70 100644
--- a/release_scripts/localization_scripts/fileutil.py
+++ b/release_scripts/localization_scripts/fileutil.py
@@ -6,6 +6,7 @@ from pathlib import Path
 def get_path_pieces(orig_path: str) -> Tuple[str, Union[str, None], Union[str, None]]:
     """Retrieves path pieces.  This is a naive approach as it determines if a file is present based on the
     presence of an extension.
+
     Args:
         orig_path:  The original path to deconstruct.
 
@@ -27,6 +28,7 @@ def get_path_pieces(orig_path: str) -> Tuple[str, Union[str, None], Union[str, N
 def get_joined_path(folder: str, file_name: str) -> str:
     """
     Gets a joined folder and filename.
+
     Args:
         folder: The folder.
         file_name: The filename.
@@ -53,16 +55,6 @@ def get_new_path(orig_path: str, new_filename: str) -> str:
     return str(Path(parent_dir) / Path(new_filename))
 
 
-# For use with creating csv filenames for entries that have been omitted.
-OMITTED_ADDITION = '-omitted'
-
-# For use with creating csv filenames for entries that have been deleted.
-DELETED_ADDITION = '-deleted'
-
-# For translations where
-FOUND_ADDITION = '-found'
-
-
 def get_filename_addition(orig_path: str, filename_addition: str) -> str:
     """Gets filename with addition.  So if item is '/path/name.ext' and the filename_addition is '-add', the new result
     would be '/path/name-add.ext'.
diff --git a/release_scripts/localization_scripts/gitutil.py b/release_scripts/localization_scripts/gitutil.py
index 43c20e2ce0..54bdefcbad 100644
--- a/release_scripts/localization_scripts/gitutil.py
+++ b/release_scripts/localization_scripts/gitutil.py
@@ -1,7 +1,7 @@
 """Functions relating to using git and GitPython with an existing repo.
 """
 
-from git import Repo, Diff, Blob
+from git import Repo, Diff, Blob, Tree
 from typing import List, Union, Iterator, Tuple, Any
 from itemchange import ItemChange, get_changed
 from pathlib import Path
@@ -17,6 +17,7 @@ def get_git_root(child_path: str) -> str:
     """
     Taken from https://stackoverflow.com/questions/22081209/find-the-root-of-the-git-repository-where-the-file-lives,
     this obtains the root path of the git repo in which this file exists.
+
     Args:
         child_path:  The path of a child within the repo.
 
@@ -146,6 +147,21 @@ def list_paths(root_tree, path: Path = Path('.')) -> Iterator[Tuple[str, Blob]]:
         yield from list_paths(tree, path / tree.name)
 
 
+def get_tree(repo_path: str, commit_id: str) -> Tree:
+    """
+    Retrieves the root tree at the specified commit; pass it to list_paths to walk (path, blob) pairs for its files.
+
+    Args:
+        repo_path: The path to the repo or a child directory of the repo.
+        commit_id: The commit id.
+
+    Returns: The tree.
+    """
+    repo = Repo(repo_path, search_parent_directories=True)
+    commit = repo.commit(commit_id.strip())
+    return commit.tree
+
+
 def get_property_file_entries(repo_path: str, at_commit: str = 'HEAD',
                               property_file_extension: str = DEFAULT_PROPS_EXTENSION) -> Iterator[PropEntry]:
     """
@@ -157,11 +173,8 @@ def get_property_file_entries(repo_path: str, at_commit: str = 'HEAD',
         property_file_extension: The extension to use for scanning for property files.
 
     Returns: An iterator of PropEntry objects.
-
     """
-    repo = Repo(repo_path, search_parent_directories=True)
-    commit = repo.commit(at_commit.strip())
-    for item in list_paths(commit.tree):
+    for item in list_paths(get_tree(repo_path, at_commit)):
         path, blob = item
         if path.endswith(property_file_extension):
             for key, val in get_entry_dict(get_text(blob)).items():
diff --git a/release_scripts/localization_scripts/itemchange.py b/release_scripts/localization_scripts/itemchange.py
index 54213dc98b..dff8731ec3 100644
--- a/release_scripts/localization_scripts/itemchange.py
+++ b/release_scripts/localization_scripts/itemchange.py
@@ -58,6 +58,7 @@ class ItemChange:
 
     def get_row(self, show_translated_col: bool) -> List[str]:
         """Returns the list of values to be entered as a row in csv serialization.
+
         Args:
             show_translated_col (bool): Whether or not the translated columns are showing; otherwise use default.
 
@@ -92,6 +93,7 @@ def convert_to_output(items: Iterator[ItemChange],
                       separate_deleted: bool = True) -> OutputResult:
     """
     Converts PropEntry objects to an output result to be written to a tabular datasource.
+
     Args:
         items: The PropEntry items.
         commit1_id: The first commit id to be shown in the header or None.
diff --git a/release_scripts/localization_scripts/langpropsutil.py b/release_scripts/localization_scripts/langpropsutil.py
index 463f570cc0..84af3cc4e0 100644
--- a/release_scripts/localization_scripts/langpropsutil.py
+++ b/release_scripts/localization_scripts/langpropsutil.py
@@ -23,6 +23,7 @@ def _get_props_path(language_updates_file: Union[str, None]):
 def get_commit_for_language(language: str, language_updates_file: Union[str, None] = None) -> Union[str, None]:
     """
     Retrieves the latest commit for a particular language.
+
     Args:
         language: The language key.
         language_updates_file: The file containing the most recent updates.  If not provided, the default file located
@@ -45,6 +46,7 @@ def get_commit_for_language(language: str, language_updates_file: Union[str, Non
 def set_commit_for_language(language: str, latest_commit: str, language_updates_file: Union[str, None] = None):
     """
     Sets the most recent update for a language within the language updates file.
+
     Args:
         language: The language key.
         latest_commit: The commit for how recent the language is.
diff --git a/release_scripts/localization_scripts/languagedictutil.py b/release_scripts/localization_scripts/languagedictutil.py
index fcf17a47a3..9f45c4c8d1 100644
--- a/release_scripts/localization_scripts/languagedictutil.py
+++ b/release_scripts/localization_scripts/languagedictutil.py
@@ -19,6 +19,7 @@ class FoundValue:
     def __init__(self, common_path, original_file, translated_file, key, orig_val, translated_val):
         """
         Constructor.
+
         Args:
             common_path: The folder common to both files.
             original_file: The original file path.
@@ -38,15 +39,22 @@ class FoundValue:
 def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: str, translated_filename: str) \
         -> Dict[str, FoundValue]:
     """
+    Creates a translations dictionary based on comparing the values of keys in an original bundles file and a translated
+    bundles file in the same directory.  For instance, if /path/to/original.properties and
+    /path/to/translated.properties both exist and in both files, a key-value pairing for keyA exists, the dictionary
+    will contain an entry mapping the original value for keyA to the translated value and other metadata for that
+    key.
 
     Args:
-        file_iter:
-        orig_filename:
-        translated_filename:
+        file_iter: An iterator of tuples containing the path and the content of the file.
+        orig_filename: The original file name (e.g. 'Bundle.properties-MERGED').
+        translated_filename: The translated file name (e.g. 'Bundle_ja.properties').
 
-    Returns:
+    Returns: A dictionary mapping each original value to a FoundValue holding the translated value and its metadata.
 
     """
+
+    # Create a dictionary mapping parent path to the file content for both original and translated files
     original_files: Dict[str, Tuple[str, Blob]] = dict()
     translated_files: Dict[str, Tuple[str, Blob]] = dict()
 
@@ -57,8 +65,8 @@ def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: s
         elif file_name.strip().lower() == translated_filename.strip().lower():
             translated_files[file_name] = (parent_dir, content)
 
+    # determine original and translated files with common parent folders and find common keys
     to_ret: Dict[str, FoundValue] = dict()
-
     for common_folder, ((original_path, original_blob), (translated_path, translated_blob))\
             in common_entries(original_files, translated_files):
         orig_dict = sanitize_prop_dict_keys(get_entry_dict(original_blob))
@@ -77,6 +85,15 @@ def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: s
 
 
 def sanitize_prop_dict_keys(dct: Dict[str, str]) -> Dict[str, str]:
+    """
+    Sanitizes all the keys in a dictionary (i.e. strips white space and makes lower case).
+
+    Args:
+        dct: The dictionary.
+
+    Returns: The dictionary with sanitized keys.
+
+    """
     return {k.strip().lower(): v for k, v in dct.items()}
 
 
@@ -89,6 +106,7 @@ def common_entries(*dcts: Dict[K, V]) -> Iterator[Tuple[K, Tuple[V, ...]]]:
     Taken from https://stackoverflow.com/questions/16458340/python-equivalent-of-zip-for-dictionaries,
     creates creates an iterator of tuples where the left value is the common key value and the right hand value is
     a tuple of all the matching values in order that the dictionaries were ordered in parameters.
+
     Args:
         *dcts: The dictionaries in order to provide common key/values.
 
diff --git a/release_scripts/localization_scripts/outputresult.py b/release_scripts/localization_scripts/outputresult.py
index 0fc9a55ac0..f161c2670b 100644
--- a/release_scripts/localization_scripts/outputresult.py
+++ b/release_scripts/localization_scripts/outputresult.py
@@ -25,6 +25,7 @@ class OutputResult:
                  style: Union[List[ColumnStyle], None] = None, freeze_first_row: bool = True):
         """
         Constructs a ProcessingResult.
+
         Args:
             results: Items to be written as results.  Data will be written such that the item at row,cell will be
             located within result at results[row][col].
diff --git a/release_scripts/localization_scripts/propentry.py b/release_scripts/localization_scripts/propentry.py
index feb0896cd5..67c715dda1 100644
--- a/release_scripts/localization_scripts/propentry.py
+++ b/release_scripts/localization_scripts/propentry.py
@@ -40,6 +40,7 @@ def convert_to_output(items: Iterator[PropEntry], commit_id: Union[str, None] =
                       show_translated_col: bool = True, value_regex: Union[str, None] = None) -> OutputResult:
     """
     Converts PropEntry objects to an output result to be written to a tabular datasource.
+
     Args:
         items: The PropEntry items.
         commit_id: The commit id to be shown in the header or None.
diff --git a/release_scripts/localization_scripts/propsutil.py b/release_scripts/localization_scripts/propsutil.py
index 3de52a7966..833a445c09 100644
--- a/release_scripts/localization_scripts/propsutil.py
+++ b/release_scripts/localization_scripts/propsutil.py
@@ -5,9 +5,11 @@ from jproperties import Properties
 import os
 
 # The default extension for property files in autopsy repo
-
 DEFAULT_PROPS_EXTENSION = 'properties-MERGED'
 
+# The default filename for property files in autopsy repo
+DEFAULT_PROPS_FILENAME = 'Bundle.{ext}'.format(ext=DEFAULT_PROPS_EXTENSION)
+
 
 def get_lang_bundle_name(language: str) -> str:
     """
@@ -43,6 +45,7 @@ def get_entry_dict_from_path(props_path: str) -> Union[Dict[str, str], None]:
     """
     Retrieves a dictionary mapping the properties represented in the string or None if no properties file can be found
     at that path.
+
     Args:
         props_path: The path to the properties file.
 
diff --git a/release_scripts/localization_scripts/tabularutil.py b/release_scripts/localization_scripts/tabularutil.py
index 4862cb2e38..63f68cd9c3 100644
--- a/release_scripts/localization_scripts/tabularutil.py
+++ b/release_scripts/localization_scripts/tabularutil.py
@@ -32,6 +32,7 @@ def create_output_result(row_header: List[str], results: List[List[str]],
 
     """
     Creates OutputResult from components.
+
     Args:
         row_header: The row header.
         results: The results.
@@ -47,4 +48,9 @@ def create_output_result(row_header: List[str], results: List[List[str]],
     deleted_result = [row_header] + deleted if deleted else None
     found_result = [row_header] + found_translation if found_translation else None
 
-    return OutputResult([row_header] + results, omitted_result, deleted_result, style)
+    return OutputResult(
+        results=[row_header] + results,
+        omitted=omitted_result,
+        deleted=deleted_result,
+        found=found_result,
+        style=style)
diff --git a/release_scripts/localization_scripts/updatepropsscript.py b/release_scripts/localization_scripts/updatepropsscript.py
index 0ee1fcc172..9f62aebe6b 100644
--- a/release_scripts/localization_scripts/updatepropsscript.py
+++ b/release_scripts/localization_scripts/updatepropsscript.py
@@ -195,6 +195,7 @@ class DataRows:
                  deleted_results: Union[List[List[str]], None] = None):
         """
         Creates a DataRows object.
+
         Args:
             results: The 2d list of strings representing cells.
             header: The header row if present.
@@ -208,6 +209,7 @@ class DataRows:
 def get_csv_rows(input_path: str, has_header: bool) -> DataRows:
     """
     Gets rows of a csv file in a DataRows format.
+
     Args:
         input_path: The input path of the file.
         has_header: Whether or not it has a header.
@@ -222,6 +224,7 @@ def get_csv_rows(input_path: str, has_header: bool) -> DataRows:
 def get_xlsx_rows(input_path: str, has_header: bool, results_sheet: str, deleted_sheet: str) -> DataRows:
     """
     Gets worksheets of an excel workbook in a DataRows format.
+
     Args:
         input_path: The input path of the file.
         has_header: Whether or not is has a header.
@@ -250,6 +253,7 @@ def get_prop_entries_from_data(datarows: DataRows, path_idx: int, key_idx: int,
                                path_converter: Callable) -> List[PropEntry]:
     """
     Converts a DataRows object into PropEntry objects.
+
     Args:
         datarows: The DataRows object.
         path_idx: The index of the column containing the path.