From 2a23c34b7cbeed082c2cb032a0722aaecce858e7 Mon Sep 17 00:00:00 2001
From: Greg DiCristofaro <gregd@basistech.com>
Date: Fri, 17 Jul 2020 10:06:50 -0400
Subject: [PATCH] Use utf-8-sig for csv read & write; add value regex filter
 (regex filtering still needs to be fully implemented)

---
 .../localization_scripts/allbundlesscript.py  | 15 +++++++++---
 .../localization_scripts/csvutil.py           |  6 ++---
 .../localization_scripts/diffscript.py        | 24 +++++++++++++++----
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/release_scripts/localization_scripts/allbundlesscript.py b/release_scripts/localization_scripts/allbundlesscript.py
index b9efd7b0b3..3f062424b8 100644
--- a/release_scripts/localization_scripts/allbundlesscript.py
+++ b/release_scripts/localization_scripts/allbundlesscript.py
@@ -9,10 +9,12 @@ import sys
 from envutil import get_proj_dir
 from gitutil import get_property_file_entries, get_commit_id, get_git_root
 from csvutil import records_to_csv
+from typing import Union
+import re
 import argparse
 
 
-def write_items_to_csv(repo_path: str, output_path: str, show_commit: bool):
+def write_items_to_csv(repo_path: str, output_path: str, show_commit: bool, value_regex: Union[str, None]):
     """Determines the contents of '.properties-MERGED' files and writes to a csv file.
 
     Args:
@@ -28,7 +30,8 @@ def write_items_to_csv(repo_path: str, output_path: str, show_commit: bool):
     rows = [row_header]
 
     for entry in get_property_file_entries(repo_path):
-        rows.append([entry.rel_path, entry.key, entry.value])
+        if value_regex is None or re.match(value_regex, entry.value):
+            rows.append([entry.rel_path, entry.key, entry.value])
 
     records_to_csv(output_path, rows)
 
@@ -42,13 +45,19 @@ def main():
                         help='The path to the repo.  If not specified, path of script is used.')
     parser.add_argument('-nc', '--no_commit', dest='no_commit', action='store_true', default=False,
                         required=False, help="Suppresses adding commits to the generated csv header.")
+    parser.add_argument('-vr', '--value-regex', dest='value_regex', type=str, default=None, required=False,
+                        help='Specify the regex for the property value where a regex match against the property value '
+                             'will display the key value pair in csv output (i.e. \'[a-zA-Z]\' or \'\\S\' for removing '
+                             'just whitespace items).  If this option is not specified, all key value pairs will be '
+                             'accepted.')
 
     args = parser.parse_args()
     repo_path = args.repo_path if args.repo_path is not None else get_git_root(get_proj_dir())
     output_path = args.output_path
     show_commit = not args.no_commit
+    value_regex = args.value_regex
 
-    write_items_to_csv(repo_path, output_path, show_commit)
+    write_items_to_csv(repo_path, output_path, show_commit, value_regex)
 
     sys.exit(0)
 
diff --git a/release_scripts/localization_scripts/csvutil.py b/release_scripts/localization_scripts/csvutil.py
index aa382944e1..acfe8e911f 100644
--- a/release_scripts/localization_scripts/csvutil.py
+++ b/release_scripts/localization_scripts/csvutil.py
@@ -4,6 +4,7 @@
 from typing import List, Iterable, Tuple
 import csv
 import os
+import codecs
 
 
 def records_to_csv(output_path: str, rows: Iterable[List[str]]):
@@ -20,9 +21,8 @@ def records_to_csv(output_path: str, rows: Iterable[List[str]]):
     if not os.path.exists(parent_dir):
         os.makedirs(parent_dir)
 
-    with open(output_path, 'w', encoding="utf-8", newline='') as csvfile:
+    with open(output_path, 'w', encoding="utf-8-sig", newline='') as csvfile:
         writer = csv.writer(csvfile)
-
         for row in rows:
             writer.writerow(row)
 
@@ -35,7 +35,7 @@ def csv_to_records(input_path: str, header_row: bool) -> Tuple[List[List[str]],
         header_row (bool): Whether or not there is a header row to be skipped.
     """
 
-    with open(input_path, encoding='utf-8') as csv_file:
+    with open(input_path, encoding='utf-8-sig') as csv_file:
         csv_reader = csv.reader(csv_file, delimiter=',')
 
         header = None
diff --git a/release_scripts/localization_scripts/diffscript.py b/release_scripts/localization_scripts/diffscript.py
index b8dfb522c2..fd4de08093 100644
--- a/release_scripts/localization_scripts/diffscript.py
+++ b/release_scripts/localization_scripts/diffscript.py
@@ -11,11 +11,14 @@ from itemchange import ItemChange
 from csvutil import records_to_csv
 import argparse
 import pathlib
+from typing import Union
+import re
 
 from langpropsutil import get_commit_for_language, LANG_FILENAME
 
 
-def write_diff_to_csv(repo_path: str, output_path: str, commit_1_id: str, commit_2_id: str, show_commits: bool):
+def write_diff_to_csv(repo_path: str, output_path: str, commit_1_id: str, commit_2_id: str, show_commits: bool,
+                      value_regex: Union[str, None]):
     """Determines the changes made in '.properties-MERGED' files from one commit to another commit.
 
     Args:
@@ -23,7 +26,9 @@ def write_diff_to_csv(repo_path: str, output_path: str, commit_1_id: str, commit
         output_path (str): The output path for the csv file.
         commit_1_id (str): The initial commit for the diff.
         commit_2_id (str): The latest commit for the diff.
-        show_commits (bool): show commits in the header row.
+        show_commits (bool): Show commits in the header row.
+        value_regex (Union[str, None]): If not None, only key value pairs whose current value matches this regex
+        (via re.match, i.e. anchored at the start of the value) will be included.
     """
 
     row_header = ItemChange.get_headers()
@@ -32,8 +37,11 @@ def write_diff_to_csv(repo_path: str, output_path: str, commit_1_id: str, commit
 
     rows = [row_header]
 
-    rows += map(lambda item_change: item_change.get_row(),
-                get_property_files_diff(repo_path, commit_1_id, commit_2_id))
+    item_changes = get_property_files_diff(repo_path, commit_1_id, commit_2_id)
+    if value_regex is not None:
+        item_changes = filter(lambda item_change: re.match(value_regex, item_change.cur_val) is not None, item_changes)
+
+    rows += map(lambda item_change: item_change.get_row(), item_changes)
 
     records_to_csv(output_path, rows)
 
@@ -57,11 +65,17 @@ def main():
     parser.add_argument('-l', '--language', dest='language', type=str, default='HEAD', required=False,
                         help='Specify the language in order to determine the first commit to use (i.e. \'ja\' for '
                              'Japanese.  This flag overrides the first-commit flag.')
+    parser.add_argument('-vr', '--value-regex', dest='value_regex', type=str, default=None, required=False,
+                        help='Specify the regex for the property value where a regex match against the property value '
+                             'will display the key value pair in csv output (i.e. \'[a-zA-Z]\' or \'\\S\' for removing '
+                             'just whitespace items).  If this option is not specified, all key value pairs will be '
+                             'accepted.')
 
     args = parser.parse_args()
     repo_path = args.repo_path if args.repo_path is not None else get_git_root(get_proj_dir())
     output_path = args.output_path
     commit_1_id = args.commit_1_id
+    value_regex = args.value_regex
     if args.language is not None:
         commit_1_id = get_commit_for_language(args.language)
 
@@ -74,7 +88,7 @@ def main():
     commit_2_id = args.commit_2_id
     show_commits = not args.no_commits
 
-    write_diff_to_csv(repo_path, output_path, commit_1_id, commit_2_id, show_commits)
+    write_diff_to_csv(repo_path, output_path, commit_1_id, commit_2_id, show_commits, value_regex)
 
     sys.exit(0)