From 48cdcdd602fdd95684cccb035abdc647f2c67c41 Mon Sep 17 00:00:00 2001 From: Greg DiCristofaro Date: Tue, 5 Jan 2021 10:18:59 -0500 Subject: [PATCH] working on translation dictionary implementation --- .../localization_scripts/csvutil.py | 12 +++++++- .../localization_scripts/diffscript.py | 16 +++++++++-- .../localization_scripts/envutil.py | 1 + .../localization_scripts/fileutil.py | 12 ++------ .../localization_scripts/gitutil.py | 23 +++++++++++---- .../localization_scripts/itemchange.py | 2 ++ .../localization_scripts/langpropsutil.py | 2 ++ .../localization_scripts/languagedictutil.py | 28 +++++++++++++++---- .../localization_scripts/outputresult.py | 1 + .../localization_scripts/propentry.py | 1 + .../localization_scripts/propsutil.py | 5 +++- .../localization_scripts/tabularutil.py | 8 +++++- .../localization_scripts/updatepropsscript.py | 4 +++ 13 files changed, 89 insertions(+), 26 deletions(-) diff --git a/release_scripts/localization_scripts/csvutil.py b/release_scripts/localization_scripts/csvutil.py index e34dc8ea17..21e98cac52 100644 --- a/release_scripts/localization_scripts/csvutil.py +++ b/release_scripts/localization_scripts/csvutil.py @@ -4,7 +4,7 @@ from typing import List, Iterable, Tuple import csv import os -from fileutil import OMITTED_ADDITION, get_filename_addition, DELETED_ADDITION, FOUND_ADDITION +from fileutil import get_filename_addition from outputresult import OutputResult @@ -54,6 +54,16 @@ def csv_to_records(input_path: str, header_row: bool) -> Tuple[List[List[str]], return results, header +# For use with creating csv filenames for entries that have been omitted. +OMITTED_ADDITION = '-omitted' + +# For use with creating csv filenames for entries that have been deleted. +DELETED_ADDITION = '-deleted' + +# For use with creating csv filenames for entries where an existing translation was found. +FOUND_ADDITION = '-found' + + def write_results_to_csv(results: OutputResult, output_path: str): """ Writes the result of processing to the output path as a csv file. 
If omitted values are present, for output_path of diff --git a/release_scripts/localization_scripts/diffscript.py b/release_scripts/localization_scripts/diffscript.py index c1b04af152..3f08747f6e 100644 --- a/release_scripts/localization_scripts/diffscript.py +++ b/release_scripts/localization_scripts/diffscript.py @@ -5,12 +5,14 @@ As a consequence, it also requires git >= 1.7.0 and python >= 3.4. import sys from envutil import get_proj_dir from excelutil import write_results_to_xlsx -from gitutil import get_property_files_diff, get_git_root, get_commit_id +from gitutil import get_property_files_diff, get_git_root, get_commit_id, get_tree from itemchange import convert_to_output from csvutil import write_results_to_csv import argparse from langpropsutil import get_commit_for_language, LANG_FILENAME from outputtype import OutputType +from languagedictutil import extract_translations +from propsutil import get_lang_bundle_name, DEFAULT_PROPS_FILENAME def main(): @@ -41,7 +43,7 @@ def main(): help='Specify the path to the properties file containing key value pairs of language mapped to ' 'the commit of when bundles for that language were most recently updated.') - parser.add_argument('-t', '--translation-dict', dest='translation_dict', type=bool, required=False, default=False, + parser.add_argument('-td', '--translation-dict', dest='translation_dict', type=bool, required=False, default=False, help='If this flag is specified, a dictionary mapping original prop key values to translated ' 'values. 
If this flag is specified, it will ') @@ -55,7 +57,7 @@ def main(): output_type = args.output_type show_translated_col = not args.no_translated_col language_updates_file = args.language_file - + use_translation_dict = args.translation_dict lang = args.language if lang is not None: commit_1_id = get_commit_for_language(lang, language_updates_file) @@ -66,6 +68,13 @@ def main(): parser.print_help(sys.stderr) sys.exit(1) + translation_dict = None + if use_translation_dict and lang: + translation_dict = extract_translations( + file_iter=get_tree(repo_path, commit_1_id), + orig_filename=DEFAULT_PROPS_FILENAME, + translated_filename=get_lang_bundle_name(lang)) + commit_2_id = args.commit_2_id show_commits = not args.no_commits @@ -73,6 +82,7 @@ def main(): processing_result = convert_to_output(changes, commit1_id=get_commit_id(repo_path, commit_1_id) if show_commits else None, commit2_id=get_commit_id(repo_path, commit_2_id) if show_commits else None, + translation_dict=translation_dict, show_translated_col=show_translated_col, separate_deleted=True) diff --git a/release_scripts/localization_scripts/envutil.py b/release_scripts/localization_scripts/envutil.py index cec2a00eda..5275894b91 100644 --- a/release_scripts/localization_scripts/envutil.py +++ b/release_scripts/localization_scripts/envutil.py @@ -8,6 +8,7 @@ from typing import Union def get_proj_dir(path: Union[pathlib.PurePath, str] = __file__) -> str: """ Gets parent directory of this file (and subsequently, the project). + Args: path: Can be overridden to provide a different file. This will return the parent of that file in that instance. 
diff --git a/release_scripts/localization_scripts/fileutil.py b/release_scripts/localization_scripts/fileutil.py index 561420220e..e31b554c70 100644 --- a/release_scripts/localization_scripts/fileutil.py +++ b/release_scripts/localization_scripts/fileutil.py @@ -6,6 +6,7 @@ from pathlib import Path def get_path_pieces(orig_path: str) -> Tuple[str, Union[str, None], Union[str, None]]: """Retrieves path pieces. This is a naive approach as it determines if a file is present based on the presence of an extension. + Args: orig_path: The original path to deconstruct. @@ -27,6 +28,7 @@ def get_path_pieces(orig_path: str) -> Tuple[str, Union[str, None], Union[str, N def get_joined_path(folder: str, file_name: str) -> str: """ Gets a joined folder and filename. + Args: folder: The folder. file_name: The filename. @@ -53,16 +55,6 @@ def get_new_path(orig_path: str, new_filename: str) -> str: return str(Path(parent_dir) / Path(new_filename)) -# For use with creating csv filenames for entries that have been omitted. -OMITTED_ADDITION = '-omitted' - -# For use with creating csv filenames for entries that have been deleted. -DELETED_ADDITION = '-deleted' - -# For translations where -FOUND_ADDITION = '-found' - - def get_filename_addition(orig_path: str, filename_addition: str) -> str: """Gets filename with addition. So if item is '/path/name.ext' and the filename_addition is '-add', the new result would be '/path/name-add.ext'. diff --git a/release_scripts/localization_scripts/gitutil.py b/release_scripts/localization_scripts/gitutil.py index 43c20e2ce0..54bdefcbad 100644 --- a/release_scripts/localization_scripts/gitutil.py +++ b/release_scripts/localization_scripts/gitutil.py @@ -1,7 +1,7 @@ """Functions relating to using git and GitPython with an existing repo. 
""" -from git import Repo, Diff, Blob +from git import Repo, Diff, Blob, Tree from typing import List, Union, Iterator, Tuple, Any from itemchange import ItemChange, get_changed from pathlib import Path @@ -17,6 +17,7 @@ def get_git_root(child_path: str) -> str: """ Taken from https://stackoverflow.com/questions/22081209/find-the-root-of-the-git-repository-where-the-file-lives, this obtains the root path of the git repo in which this file exists. + Args: child_path: The path of a child within the repo. @@ -146,6 +147,21 @@ def list_paths(root_tree, path: Path = Path('.')) -> Iterator[Tuple[str, Blob]]: yield from list_paths(tree, path / tree.name) +def get_tree(repo_path: str, commit_id: str) -> Tree: + """ + Retrieves the tree that can be walked for files and file content at the specified commit. + + Args: + repo_path: The path to the repo or a child directory of the repo. + commit_id: The commit id. + + Returns: The tree. + """ + repo = Repo(repo_path, search_parent_directories=True) + commit = repo.commit(commit_id.strip()) + return commit.tree + + def get_property_file_entries(repo_path: str, at_commit: str = 'HEAD', property_file_extension: str = DEFAULT_PROPS_EXTENSION) -> Iterator[PropEntry]: """ @@ -157,11 +173,8 @@ def get_property_file_entries(repo_path: str, at_commit: str = 'HEAD', property_file_extension: The extension to use for scanning for property files. Returns: An iterator of PropEntry objects. 
- """ - repo = Repo(repo_path, search_parent_directories=True) - commit = repo.commit(at_commit.strip()) - for item in list_paths(commit.tree): + for item in get_tree(repo_path, at_commit): path, blob = item if path.endswith(property_file_extension): for key, val in get_entry_dict(get_text(blob)).items(): diff --git a/release_scripts/localization_scripts/itemchange.py b/release_scripts/localization_scripts/itemchange.py index 54213dc98b..dff8731ec3 100644 --- a/release_scripts/localization_scripts/itemchange.py +++ b/release_scripts/localization_scripts/itemchange.py @@ -58,6 +58,7 @@ class ItemChange: def get_row(self, show_translated_col: bool) -> List[str]: """Returns the list of values to be entered as a row in csv serialization. + Args: show_translated_col (bool): Whether or not the translated columns are showing; otherwise use default. @@ -92,6 +93,7 @@ def convert_to_output(items: Iterator[ItemChange], separate_deleted: bool = True) -> OutputResult: """ Converts PropEntry objects to an output result to be written to a tabular datasource. + Args: items: The PropEntry items. commit1_id: The first commit id to be shown in the header or None. diff --git a/release_scripts/localization_scripts/langpropsutil.py b/release_scripts/localization_scripts/langpropsutil.py index 463f570cc0..84af3cc4e0 100644 --- a/release_scripts/localization_scripts/langpropsutil.py +++ b/release_scripts/localization_scripts/langpropsutil.py @@ -23,6 +23,7 @@ def _get_props_path(language_updates_file: Union[str, None]): def get_commit_for_language(language: str, language_updates_file: Union[str, None] = None) -> Union[str, None]: """ Retrieves the latest commit for a particular language. + Args: language: The language key. language_updates_file: The file containing the most recent updates. 
If not provided, the default file located @@ -45,6 +46,7 @@ def get_commit_for_language(language: str, language_updates_file: Union[str, Non def set_commit_for_language(language: str, latest_commit: str, language_updates_file: Union[str, None] = None): """ Sets the most recent update for a language within the language updates file. + Args: language: The language key. latest_commit: The commit for how recent the language is. diff --git a/release_scripts/localization_scripts/languagedictutil.py b/release_scripts/localization_scripts/languagedictutil.py index fcf17a47a3..9f45c4c8d1 100644 --- a/release_scripts/localization_scripts/languagedictutil.py +++ b/release_scripts/localization_scripts/languagedictutil.py @@ -19,6 +19,7 @@ class FoundValue: def __init__(self, common_path, original_file, translated_file, key, orig_val, translated_val): """ Constructor. + Args: common_path: The folder common to both files. original_file: The original file path. @@ -38,15 +39,22 @@ class FoundValue: def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: str, translated_filename: str) \ -> Dict[str, FoundValue]: """ + Creates a translations dictionary based on comparing the values of keys in an original bundles file and a translated + bundles file in the same directory. For instance, if /path/to/original.properties and + /path/to/translated.properties both exist and in both files, a key-value pairing for keyA exists, the dictionary + will contain an entry mapping the original value for keyA to the translated value and other metadata for that + key. Args: - file_iter: - orig_filename: - translated_filename: + file_iter: An iterator of tuples containing the path and the content of the file. + orig_filename: The original file name (e.g. 'Bundle.properties-MERGED'). + translated_filename: The translated file name (e.g. 'Bundle_ja.properties'). - Returns: + Returns: A dictionary mapping original values to translated values. 
""" + + # Create a dictionary mapping parent path to the file content for both original and translated files original_files: Dict[str, Tuple[str, Blob]] = dict() translated_files: Dict[str, Tuple[str, Blob]] = dict() @@ -57,8 +65,8 @@ def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: s elif file_name.strip().lower() == translated_filename.strip().lower(): translated_files[file_name] = (parent_dir, content) + # determine original and translated files with common parent folders and find common keys to_ret: Dict[str, FoundValue] = dict() - for common_folder, ((original_path, original_blob), (translated_path, translated_blob))\ in common_entries(original_files, translated_files): orig_dict = sanitize_prop_dict_keys(get_entry_dict(original_blob)) @@ -77,6 +85,15 @@ def extract_translations(file_iter: Iterator[Tuple[str, Blob]], orig_filename: s def sanitize_prop_dict_keys(dct: Dict[str, str]) -> Dict[str, str]: + """ + Sanitizes all the keys in a dictionary (i.e. strips white space and makes lower case). + + Args: + dct: The dictionary. + + Returns: The dictionary with sanitized keys. + + """ return {k.strip().lower(): v for k, v in dct.items()} @@ -89,6 +106,7 @@ def common_entries(*dcts: Dict[K, V]) -> Iterator[Tuple[K, Tuple[V, ...]]]: Taken from https://stackoverflow.com/questions/16458340/python-equivalent-of-zip-for-dictionaries, creates creates an iterator of tuples where the left value is the common key value and the right hand value is a tuple of all the matching values in order that the dictionaries were ordered in parameters. + Args: *dcts: The dictionaries in order to provide common key/values. 
diff --git a/release_scripts/localization_scripts/outputresult.py b/release_scripts/localization_scripts/outputresult.py index 0fc9a55ac0..f161c2670b 100644 --- a/release_scripts/localization_scripts/outputresult.py +++ b/release_scripts/localization_scripts/outputresult.py @@ -25,6 +25,7 @@ class OutputResult: style: Union[List[ColumnStyle], None] = None, freeze_first_row: bool = True): """ Constructs a ProcessingResult. + Args: results: Items to be written as results. Data will be written such that the item at row,cell will be located within result at results[row][col]. diff --git a/release_scripts/localization_scripts/propentry.py b/release_scripts/localization_scripts/propentry.py index feb0896cd5..67c715dda1 100644 --- a/release_scripts/localization_scripts/propentry.py +++ b/release_scripts/localization_scripts/propentry.py @@ -40,6 +40,7 @@ def convert_to_output(items: Iterator[PropEntry], commit_id: Union[str, None] = show_translated_col: bool = True, value_regex: Union[str, None] = None) -> OutputResult: """ Converts PropEntry objects to an output result to be written to a tabular datasource. + Args: items: The PropEntry items. commit_id: The commit id to be shown in the header or None. 
diff --git a/release_scripts/localization_scripts/propsutil.py b/release_scripts/localization_scripts/propsutil.py index 3de52a7966..833a445c09 100644 --- a/release_scripts/localization_scripts/propsutil.py +++ b/release_scripts/localization_scripts/propsutil.py @@ -5,9 +5,11 @@ from jproperties import Properties import os # The default extension for property files in autopsy repo - DEFAULT_PROPS_EXTENSION = 'properties-MERGED' +# The default filename for property files in autopsy repo +DEFAULT_PROPS_FILENAME = 'Bundle.{ext}'.format(ext=DEFAULT_PROPS_EXTENSION) + def get_lang_bundle_name(language: str) -> str: """ @@ -43,6 +45,7 @@ def get_entry_dict_from_path(props_path: str) -> Union[Dict[str, str], None]: """ Retrieves a dictionary mapping the properties represented in the string or None if no properties file can be found at that path. + Args: props_path: The path to the properties file. diff --git a/release_scripts/localization_scripts/tabularutil.py b/release_scripts/localization_scripts/tabularutil.py index 4862cb2e38..63f68cd9c3 100644 --- a/release_scripts/localization_scripts/tabularutil.py +++ b/release_scripts/localization_scripts/tabularutil.py @@ -32,6 +32,7 @@ def create_output_result(row_header: List[str], results: List[List[str]], """ Creates OutputResult from components. + Args: row_header: The row header. results: The results. 
@@ -47,4 +48,9 @@ def create_output_result(row_header: List[str], results: List[List[str]], deleted_result = [row_header] + deleted if deleted else None found_result = [row_header] + found_translation if found_translation else None - return OutputResult([row_header] + results, omitted_result, deleted_result, style) + return OutputResult( + results=[row_header] + results, + omitted=omitted_result, + deleted=deleted_result, + found=found_result, + style=style) diff --git a/release_scripts/localization_scripts/updatepropsscript.py b/release_scripts/localization_scripts/updatepropsscript.py index 0ee1fcc172..9f62aebe6b 100644 --- a/release_scripts/localization_scripts/updatepropsscript.py +++ b/release_scripts/localization_scripts/updatepropsscript.py @@ -195,6 +195,7 @@ class DataRows: deleted_results: Union[List[List[str]], None] = None): """ Creates a DataRows object. + Args: results: The 2d list of strings representing cells. header: The header row if present. @@ -208,6 +209,7 @@ class DataRows: def get_csv_rows(input_path: str, has_header: bool) -> DataRows: """ Gets rows of a csv file in a DataRows format. + Args: input_path: The input path of the file. has_header: Whether or not it has a header. @@ -222,6 +224,7 @@ def get_csv_rows(input_path: str, has_header: bool) -> DataRows: def get_xlsx_rows(input_path: str, has_header: bool, results_sheet: str, deleted_sheet: str) -> DataRows: """ Gets worksheets of an excel workbook in a DataRows format. + Args: input_path: The input path of the file. has_header: Whether or not is has a header. @@ -250,6 +253,7 @@ def get_prop_entries_from_data(datarows: DataRows, path_idx: int, key_idx: int, path_converter: Callable) -> List[PropEntry]: """ Converts a DataRows object into PropEntry objects. + Args: datarows: The DataRows object. path_idx: The index of the column containing the path.