# blob: 1281248734fb03eefa15b11c713b65426a464d57 [file] [log] [blame]
"""This module parses `git diff` output to determine the added/changed
lines per file.
"""
from typing import Tuple, Text, List, Dict
import re
class GitDiffSyntaxError(Exception):
    """Signals that the `git diff` command output is not formatted the way
    this module expects (e.g. an unparsable file header or hunk header).
    """
def from_diff(git_diff: Text) -> Dict[Text, List[int]]:
    """Converts `git diff` output into a map of files to their changed lines
    (in the new version).

    Args:
        git_diff: the raw text printed by `git diff`; may be empty or
            whitespace-only when no files had modified lines.

    Returns:
        A dict mapping each file name to the sorted list of line numbers that
        were added/changed in it. Files without any added lines are omitted.

    Raises:
        GitDiffSyntaxError if a per-file section cannot be parsed.
    """
    git_diff = git_diff.strip()
    # `git diff` returned nothing, meaning no files had modified lines
    if not git_diff:
        return {}

    # Split the `git diff` result into per-file data. The separator is only
    # honoured at the start of a line: inside a hunk every line is prefixed
    # with '+', '-' or ' ', so file content that merely mentions
    # "diff --git " can no longer be mistaken for a new file section.
    # The split yields whatever precedes the first separator (an empty string
    # when the input starts with it) at index 0, which is dropped.
    file_breakdown = re.split(r'^diff --git ', git_diff, flags=re.MULTILINE)[1:]

    file_to_lines = {}  # type: Dict[Text, List[int]]
    for file_diff in file_breakdown:
        file_name, changed_lines = _parse_file_diff(file_diff)
        # this might happen if only lines were deleted but none were added
        if not changed_lines:
            continue
        file_to_lines[file_name] = sorted(changed_lines)
    return file_to_lines
def _parse_file_diff(file_diff: Text) -> Tuple[Text, List[int]]:
"""Takes in a section of the `git diff` command that contains
all the information about a given file and returns the file's name and it's associated
changed lines.
Raises:
GitDiffSyntaxError if there are formatting issues parsing file name in `file_diff`.
"""
first_line = file_diff.split('\n')[0]
m = re.search(r'a/(\S+) b/\S+', first_line)
if not m:
raise GitDiffSyntaxError('Expected first line to contain `a/fileName b/fileName`,' + \
'got: ' + first_line)
file_name = m.group(1)
hunks = file_diff.split('\n@@ ')[1:]
changed_lines = []
for hunk in hunks:
changed_lines += _changed_lines_in_hunk(hunk)
return (file_name, changed_lines)
def _changed_lines_in_hunk(hunk: Text) -> List[int]:
"""Returns the added/modified line numbers in a hunk (in the new version).
Raises:
GitDiffSyntaxError if there are formatting issues with the first line
of `hunk`.
"""
lines = hunk.split('\n')
changed_lines = []
m = re.search(r'-\d+,\d+ \+(\d+),\d+ @@', lines[0])
if not m:
raise GitDiffSyntaxError('Expected first line in hunk to be in the format:' + \
'\n\t-#,# +#,# @@\ngot: ' + lines[0])
starting_line_number = int(m.group(1))
curr_line = starting_line_number
for line in lines[1:]:
if line == '' or line[0] == '-':
continue
if line[0] == '+':
changed_lines.append(curr_line)
curr_line += 1
return sorted(changed_lines)