Added lcov_parser.py to parse LCOV ".info" files.

Added a file, lcov_parser.py, that parses LCOV ".info" files
and converts them into a simplified form.

This parser discards function-related data and only
records, for each instrumented line, whether that line
was executed or not.

We're hoping to use this to compute incremental coverage
for kunit.

Signed-off-by: Darya Verzhbinsky <daryaver@google.com>
Change-Id: Ic3527c8d9c646a280e20a5bee54cb264402622c0
diff --git a/BUILD.bazel b/BUILD.bazel
index 887a663..94e75a4 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -52,3 +52,15 @@
     tars = ["@kunit_repo//:files"],
     workdir = "/kunit/linux",
 )
+
+py_library(
+    name = "lcov_parser",
+    srcs = ["lcov_parser/lcov_parser.py"],
+)
+
+py_test(
+    name = "lcov_parser_test",
+    srcs = ["lcov_parser/lcov_parser_test.py"],  # the attribute is `srcs`; py_test has no `src`
+    python_version = "PY3",
+    deps = [":lcov_parser"],  # deps takes target labels, not raw source files
+)
diff --git a/lcov_parser/lcov_parser.py b/lcov_parser/lcov_parser.py
new file mode 100644
index 0000000..c623efc
--- /dev/null
+++ b/lcov_parser/lcov_parser.py
@@ -0,0 +1,137 @@
+"""This file provides a parse() method to extract the per-file line coverage
+from an LCOV ".info" file.
+"""
+
+from typing import Dict, Text, Tuple, Iterable, IO, Optional
+
+
+Lines = Dict[int, bool]  # maps a line number to whether that line was executed
+
+def _merge_lines(dst: Lines, src: Lines) -> Lines:
+  """_merge_lines folds the coverage in `src` into `dst` in place and returns `dst`.
+
+  A line ends up covered if it is covered in either `dst` or `src`.
+  """
+  for line_num, covered in src.items():
+    dst[line_num] = dst.get(line_num, False) or covered
+  return dst
+
+
+class Record:
+  """Record holds the source file name and the line coverage of one LCOV record."""
+  def __init__(self):
+    self.lines = {}  # type: Lines
+    self.file = ''  # file name taken from the record's "SF:" line
+
+
+class LcovSyntaxError(Exception):
+  """LcovSyntaxError is raised when there are formatting issues in the ".info" file."""
+  pass
+
+
+def _parse_line(line: Text) -> Tuple[Text, Text]:
+  """_parse_line splits an LCOV line into its field name and its data.
+
+  LCOV lines are of the form "<field>:<data>", eg: "TN:<test name>".
+
+  Args:
+    line: line from the input file.
+
+  Returns:
+    a tuple of the text before the first ":" and the stripped text after it.
+
+  Raises:
+    LcovSyntaxError: if `line` does not contain a ":".
+  """
+  try:
+    field, value = line.split(':', 1)
+  except ValueError:  # no ":" in the line, so split() yields one item and unpacking fails.
+    raise LcovSyntaxError('invalid data line, needs to be of the form' +
+                          '\n\tDA:<line #>,<execution count>[,<checksum>]\n ' +
+                          'or\n \tTN:<test name>\ngot: ' + line)
+  return (field, value.strip())
+
+
+def _read_record(input_file: Iterable[Text]) -> Optional[Record]:
+  """_read_record reads the next LCOV record from `input_file`.
+
+  LCOV records are of the form
+        SF:<file name>
+        ...
+        end_of_record
+
+  Args:
+    input_file: the lines of the ".info" file that is being read.
+
+  Returns:
+    the next Record in the file, or None if the input ends before an
+    "end_of_record" line is seen.
+
+  Raises:
+    LcovSyntaxError: if the format of the input file doesn't match correct
+      LCOV format.
+  """
+  rec = Record()
+
+  for line in input_file:
+    line = line.strip()
+    if not line:
+      continue  # tolerate blank lines (e.g. extra newlines at the end of the file).
+
+    if line == 'end_of_record':
+      return rec
+
+    field, value = _parse_line(line)
+
+    if field == 'SF':
+      rec.file = value
+    elif field == 'DA':
+      parts = value.split(',', 2)
+      if len(parts) < 2:
+        raise LcovSyntaxError('invalid data line, needs to be of the form ' +
+                              '\n\tDA:<line #>,<execution count>[,<checksum>]\ngot: ' + line)
+      try:
+        line_num = int(parts[0])
+      except ValueError:  # the line number is not an integer.
+        raise LcovSyntaxError('invalid data line, needs to be of the form ' +
+                              '\n\tDA:<line #>,<execution count>[,<checksum>]\ngot: ' + line)
+
+      # LCOV explicitly reports a '0' execution count for instrumented lines that don't get run.
+      rec.lines[line_num] = parts[1] != '0'
+
+  return None
+
+
+def parse(input_file: IO) -> Tuple[Text, Dict[Text, Lines]]:
+  """parse reads an LCOV ".info" file and collects its per-file line coverage.
+
+  Records that name the same source file are merged: a line counts as covered
+  if any of those records reports it as covered.
+
+  Args:
+    input_file: the file that is being read.
+
+  Returns:
+    a tuple containing the test_name and the associated files w/ coverage.
+
+  Raises:
+    LcovSyntaxError: if the first line of the input file does not start with "TN".
+  """
+  # the first line in a LCOV report is the test name
+  header = input_file.readline()
+  field, test_name = _parse_line(header)
+  if field != 'TN':
+    raise LcovSyntaxError('first line in LCOV report should be "TN:<name>", got: ' +
+                          field + ':' + test_name)
+
+  files = {}  # type: Dict[Text, Lines]
+
+  # _read_record returns None once the input is exhausted, which ends the loop.
+  for rec in iter(lambda: _read_record(input_file), None):
+    seen = files.get(rec.file)
+    if seen is None:
+      files[rec.file] = rec.lines
+    else:
+      _merge_lines(seen, rec.lines)
+
+  return (test_name, files)
diff --git a/lcov_parser/lcov_parser_test.py b/lcov_parser/lcov_parser_test.py
new file mode 100755
index 0000000..c6f2155
--- /dev/null
+++ b/lcov_parser/lcov_parser_test.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python3
+
+import unittest
+import lcov_parser
+from io import StringIO
+
+
+def expected_coverage(covered, uncovered):
+	"""Builds the line number -> covered mapping that the parser should produce.
+
+	A line number listed in both arguments ends up marked as uncovered.
+	"""
+	lines = dict.fromkeys(covered, True)
+	lines.update(dict.fromkeys(uncovered, False))
+
+	return lines
+
+
+class LcovParserTest(unittest.TestCase):
+
+	def test_small_dir_passes(self):
+		want_data = {}  # maps file name -> {line number: covered?}
+
+		want_data['/kunit/test/strerror.c'] = expected_coverage([156], [154])
+		want_data['/kunit/test/strerror-test.c'] = expected_coverage([14], [16])
+
+		with StringIO(_TEST_FILE_DATA.strip()) as input_data:
+			test_name, coverage_data = lcov_parser.parse(input_data)
+			self.assertEqual(test_name, 'kunit_presubmit_tests')
+			self.assertEqual(coverage_data, want_data)
+
+	def test_raises_errors(self):
+		"""parse rejects input that is not well-formed LCOV."""
+		with self.assertRaises(lcov_parser.LcovSyntaxError):  # no "TN:" line at all
+			with StringIO('') as input_data:
+				lcov_parser.parse(input_data)
+
+		with self.assertRaises(lcov_parser.LcovSyntaxError):  # first line has no ":"
+			with StringIO('invalid_first_line\n') as input_data:
+				lcov_parser.parse(input_data)
+
+		with self.assertRaises(lcov_parser.LcovSyntaxError):  # DA line number is not an int
+			with StringIO('TN:valid_line\nDA:invalid_second,str\n') as input_data:
+				lcov_parser.parse(input_data)
+
+
+_TEST_FILE_DATA = """
+TN:kunit_presubmit_tests
+SF:/kunit/test/strerror.c
+DA:154,0
+DA:156,10
+end_of_record
+SF:/kunit/test/strerror-test.c
+DA:14,1
+DA:16,0
+end_of_record
+"""
+
+if __name__ == '__main__':
+	unittest.main()