Added changed_lines.py: extracts changed lines from git diff

Added changed_lines.py, which takes `git diff` output as input and
extracts the lines that were added or changed in each modified file.

We're hoping to use this in conjunction with lcov_parser.py
(see https://kunit-review.googlesource.com/c/prow-presubmit/+/2269)
to compute incremental coverage for KUnit.

Signed-off-by: Darya Verzhbinsky <daryaver@google.com>
Change-Id: Iaff0a6089f0f020bda0918f5bdcb29a8c3b7d922
diff --git a/BUILD.bazel b/BUILD.bazel
index a6af0ed..5de783c 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -64,3 +64,15 @@
     python_version = "PY3",
     deps = ["lcov_parser/lcov_parser.py"],
 )
+
+py_library(
+    name = "get_changed_lines",
+    srcs = ["lcov_parser/changed_lines.py"],  # parses `git diff` output
+)
+
+py_test(
+    name = "changed_lines_test",  # matches the source file, so no `main` needed
+    srcs = ["lcov_parser/changed_lines_test.py"],
+    python_version = "PY3",
+    deps = ["lcov_parser/changed_lines.py"],  # module under test
+)
diff --git a/lcov_parser/changed_lines.py b/lcov_parser/changed_lines.py
new file mode 100644
index 0000000..1281248
--- /dev/null
+++ b/lcov_parser/changed_lines.py
@@ -0,0 +1,102 @@
+"""This module parses `git diff` output to determine the added/changed
+lines per file.
+"""
+from typing import Tuple, Text, List, Dict
+import re
+
+
+class GitDiffSyntaxError(Exception):
+    """Raised when `git diff` output does not have the expected format.
+
+    Signals a formatting issue in the text produced by the command.
+    """
+
+
+def from_diff(git_diff: Text) -> Dict[Text, List[int]]:
+    """Converts `git diff` output into a map of files to their changed lines
+    (in the new version).
+
+    Args:
+        git_diff: the full text printed by `git diff`; may be empty.
+
+    Returns:
+        A dict mapping each file name to the sorted line numbers that were
+        added or modified. Files with no added lines are omitted.
+
+    Raises:
+        GitDiffSyntaxError if a file header or hunk header is malformed.
+    """
+    # A per-file section starts with `diff --git ` at the beginning of a line.
+    # Anchoring the split there keeps added/removed content that merely
+    # contains that text (prefixed by `+`, `-` or ` `) inside its own section.
+    sections = re.split(r'^diff --git ', git_diff.strip(), flags=re.MULTILINE)
+
+    file_to_lines = {}  # type: Dict[Text, List[int]]
+    # sections[0] is whatever precedes the first header (empty for plain
+    # `git diff` output, and for empty input), so it is skipped.
+    for file_diff in sections[1:]:
+        file_name, changed_lines = _parse_file_diff(file_diff)
+        if changed_lines:  # empty when the file only had deletions
+            file_to_lines[file_name] = sorted(changed_lines)
+
+    return file_to_lines
+
+
+def _parse_file_diff(file_diff: Text) -> Tuple[Text, List[int]]:
+    """Takes in the section of the `git diff` output that describes one file and
+    returns the file's name (in the new version) and its associated changed lines.
+
+    Args:
+        file_diff: a per-file section with the leading `diff --git ` removed,
+            so that its first line looks like `a/fileName b/fileName`.
+
+    Raises:
+        GitDiffSyntaxError if there are formatting issues parsing file name in
+            `file_diff`, or parsing the first line of one of its hunks.
+    """
+    first_line = file_diff.split('\n')[0]
+
+    # Line numbers refer to the new version, so report the `b/` (new) path;
+    # it differs from the `a/` path when the file was renamed.
+    m = re.search(r'a/\S+ b/(\S+)', first_line)
+    if not m:
+        raise GitDiffSyntaxError('Expected first line to contain `a/fileName b/fileName`,' + \
+                                 'got: ' + first_line)
+
+    changed_lines = []
+    # text before the first `\n@@ ` is the file header, hence the [1:]
+    for hunk in file_diff.split('\n@@ ')[1:]:
+        changed_lines += _changed_lines_in_hunk(hunk)
+    return (m.group(1), changed_lines)
+
+
+def _changed_lines_in_hunk(hunk: Text) -> List[int]:
+    """Returns the added/modified line numbers in a hunk (in the new version).
+
+    Args:
+        hunk: one hunk with its leading `@@ ` removed, i.e. the first line
+            looks like `-12,7 +12,9 @@`; a `,count` is omitted by git when
+            it equals 1.
+
+    Raises:
+        GitDiffSyntaxError if there are formatting issues with the first line
+            of `hunk`.
+    """
+    lines = hunk.split('\n')
+
+    m = re.search(r'-\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@', lines[0])
+    if not m:
+        raise GitDiffSyntaxError('Expected first line in hunk to be in the format:' + \
+                                 '\n\t-#,# +#,# @@\ngot: ' + lines[0])
+
+    changed_lines = []
+    curr_line = int(m.group(1))  # line number in the new version of the file
+    for line in lines[1:]:
+        # Removed lines and `\ No newline at end of file` markers do not exist
+        # in the new version, so they must not advance the line counter.
+        if line == '' or line.startswith(('-', '\\')):
+            continue
+        if line.startswith('+'):
+            changed_lines.append(curr_line)
+        curr_line += 1
+    return changed_lines
\ No newline at end of file
diff --git a/lcov_parser/changed_lines_test.py b/lcov_parser/changed_lines_test.py
new file mode 100755
index 0000000..9c2612b
--- /dev/null
+++ b/lcov_parser/changed_lines_test.py
@@ -0,0 +1,94 @@
+#!/usr/bin/python3
+
+import unittest
+import changed_lines
+from io import StringIO
+
+
+class GetChangedLinesTest(unittest.TestCase):
+    """Checks changed_lines.from_diff against canned `git diff` output."""
+
+    def test_normal_output(self):
+        # two modified files, the first one with two hunks
+        # (line numbers refer to the new version of each file)
+        got = changed_lines.from_diff(_TEST_FILE_DATA.strip())
+        want = {
+            'file1.txt': [22, 23, 46, 47, 48],
+            'file2.txt': [6],
+        }
+        self.assertEqual(got, want)
+
+    def test_empty_input(self):
+        # an empty diff means no file was modified
+        self.assertFalse(changed_lines.from_diff(''))
+
+    def test_added_file(self):
+        # every line of a newly created file counts as changed
+        got = changed_lines.from_diff(_TEST_FILE_DATA_ADDED.strip())
+        want = {'addedFile': [1, 2, 3, 4]}
+        self.assertEqual(got, want)
+
+    def test_removed_file(self):
+        # a deleted file has no lines in the new version
+        got = changed_lines.from_diff(_TEST_FILE_DATA_DELETED.strip())
+        self.assertFalse(got)
+
+# Diff touching two files: file1.txt has two hunks, file2.txt has one.
+_TEST_FILE_DATA = """
+diff --git a/file1.txt b/file1.txt
+index 170f11f..041325e 100755
+--- a/file1.txt
++++ b/file1.txt
+@@ -19,10 +19,10 @@
+ line 19
+ line 20
+ line 21
+-line 22
++#line 22
+-line 23
++#line 23
+
+@@ -45,8 +45,15 @@
+ 
+-echo
++#echo
++omg
++wow
+diff --git a/file2.txt b/file2.txt
+index 6675c3b..e69b66c 100644
+--- a/file2.txt
++++ b/file2.txt
+@@ -4,25 +4,28 @@
+'''some comment went here'''
+ 
+-from typing import Dict
++from typing import Dict, Text,
+"""
+# Diff of a deleted file: its single hunk contains only removed lines.
+_TEST_FILE_DATA_DELETED = """
+diff --git a/deletedFile b/deletedFile
+deleted file mode 100644
+index 005304205482..000000000000
+--- a/deletedFile
++++ /dev/null
+@@ -1,84 +0,0 @@
+-deleted
+-lines
+"""
+# Diff of a newly added file: its single hunk contains only added lines.
+_TEST_FILE_DATA_ADDED = """
+diff --git a/addedFile b/addedFile
+new file mode 100644
+index 000000000000..d906e785c4bd
+--- /dev/null
++++ b/addedFile
+@@ -0,0 +1,5 @@
++extra
++lines
++are
++added
+"""
+
+# Run the tests in this file when it is executed directly.
+if __name__ == '__main__':
+	unittest.main()