gertty/gertty/gitrepo.py
James E. Blair cd2538e04a Fix tab expansion in inter-patchset diff
There are some lines which were being read as byte arrays and
were not decoded into strings which was causing them to bypass
tab expansion (because they caused an exception which was caught
and the un-expanded text was used instead).  Correct those cases
and disable the exception handler so that we can identify any
other cases by triggering a crash.

Change-Id: I4b34132290fc8611cc770d572068197b33c0789c
2019-03-28 08:03:42 -07:00

576 lines
21 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2014 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Test changes:
# https://review.openstack.org/275862
# https://review.openstack.org/119302
# https://review.openstack.org/133550
import datetime
import logging
import difflib
import itertools
import os
import re
import git
import gitdb
import six
# Index of the old/new side in the two-element pairs used throughout this
# module (e.g. ``chunk.range[OLD]`` and each ``(old, new)`` line pair).
OLD = 0
NEW = 1
# Index of the first/last line number inside one side of ``chunk.range``.
START = 0
END = 1
# Index of the line number inside a per-side line entry.
LINENO = 0
# NOTE(review): LINE is not referenced in this file; the per-side entries
# built by DiffFile are (lineno, marker, text) tuples, so index 1 is the
# marker there -- confirm what external callers expect before relying on it.
LINE = 1
class GitTimeZone(datetime.tzinfo):
    """A fixed-offset ``tzinfo`` built from an offset given in seconds."""

    def __init__(self, offset_seconds):
        # Kept as a plain number of seconds; converted on demand.
        self._offset = offset_seconds

    def utcoffset(self, dt):
        """Return the fixed offset as a timedelta (``dt`` is ignored)."""
        return datetime.timedelta(0, self._offset)

    def dst(self, dt):
        """Return a zero timedelta: no daylight-saving adjustment."""
        return datetime.timedelta()

    def tzname(self, dt):
        """Return None: this zone has no name."""
        return None
class CommitBlob(object):
    """Stand-in blob object whose path is the commit-message pseudo file."""

    def __init__(self):
        # Every instance reports the same fixed path.
        setattr(self, 'path', '/COMMIT_MSG')
class CommitContext(object):
    """A git.diff.Diff look-alike describing a commit-message change."""

    def decorateGitTime(self, seconds, tz):
        """Format a git timestamp using its recorded timezone offset.

        :param seconds: Timestamp in seconds since the epoch.
        :param tz: Offset in seconds; it is negated before being handed
            to GitTimeZone.
        :returns: The time formatted as ``%Y-%m-%d %H:%M:%S %Z%z``.
        """
        zone = GitTimeZone(-tz)
        stamp = datetime.datetime.fromtimestamp(seconds, zone)
        return stamp.strftime('%Y-%m-%d %H:%M:%S %Z%z')

    def decorateMessage(self, commit):
        """Return the commit message lines preceded by a metadata header.

        The header consists of the Parent, Author, AuthorDate, Commit and
        CommitDate lines followed by one blank line, so that the message
        text starts on the seventh line.

        :param commit: A commit object, or a false value.
        :returns: A list of newline-terminated lines; empty when
            ``commit`` is false.
        """
        # NB: This reproduces Gerrit's behaviour.  If comments on commit
        # messages show up at the wrong place, this is the function to
        # adjust (e.g. by making the behaviour match more closely).
        if not commit:
            return []

        def as_text(value):
            # Email addresses may arrive as bytes or as text.
            if isinstance(value, six.text_type):
                return value
            return value.decode('utf8')

        parents = commit.parents
        parentsha = parents[0].hexsha[:8] if parents else None
        author, committer = commit.author, commit.committer
        author_date = self.decorateGitTime(
            commit.authored_date, commit.author_tz_offset)
        commit_date = self.decorateGitTime(
            commit.committed_date, commit.committer_tz_offset)
        author_email = as_text(author.email)
        committer_email = as_text(committer.email)

        header = [
            u"Parent: %s\n" % parentsha,
            u"Author: %s <%s>\n" % (author.name, author_email),
            u"AuthorDate: %s\n" % author_date,
            u"Commit: %s <%s>\n" % (committer.name, committer_email),
            u"CommitDate: %s\n" % commit_date,
            u"\n",
        ]
        return header + commit.message.splitlines(True)

    def __init__(self, old, new):
        """Create a CommitContext.

        :param old: A git.objects.commit object or None.
        :param new: A git.objects.commit object.
        """
        self.rename_from = None
        self.rename_to = None
        # Without an old commit the message counts as a newly added file.
        self.new_file = old is None
        self.deleted_file = False
        self.a_blob = CommitBlob()
        self.b_blob = CommitBlob()
        self.a_path = self.a_blob.path
        self.b_path = self.b_blob.path
        before = self.decorateMessage(old)
        after = self.decorateMessage(new)
        delta = difflib.unified_diff(
            before, after,
            fromfile="/a/COMMIT_MSG", tofile="/b/COMMIT_MSG")
        self.diff = ''.join(delta)
class DiffChunk(object):
    """A run of paired old/new lines together with its line-number range."""

    def __init__(self):
        self.oldlines = []
        self.newlines = []
        self.first = False
        self.last = False
        # Pairs of (old entry, new entry); filled in when finalized.
        self.lines = []
        self.calcRange()

    def __repr__(self):
        old_range = self.range[OLD]
        new_range = self.range[NEW]
        return '<%s old lines %s-%s / new lines %s-%s>' % (
            self.__class__.__name__,
            old_range[START], old_range[END],
            new_range[START], new_range[END])

    def calcRange(self):
        """Recompute ``self.range`` from ``self.lines``.

        For each side, the start is the first line number (scanning
        forward) that is neither None nor 0 and the end is the first such
        number scanning backward; 0 is used when there is none.
        """
        def edge(entries, side):
            for entry in entries:
                number = entry[side][LINENO]
                if number is not None and number != 0:
                    return number
            return 0

        backward = self.lines[::-1]
        bounds = [[0, 0], [0, 0]]
        for side in (OLD, NEW):
            bounds[side][START] = edge(self.lines, side)
            bounds[side][END] = edge(backward, side)
        self.range = bounds

    def indexOfLine(self, oldnew, lineno):
        """Return the index of the first line pair whose ``oldnew`` side
        has line number ``lineno``, or None when there is no such pair.
        """
        matches = (index for index, pair in enumerate(self.lines)
                   if pair[oldnew][LINENO] == lineno)
        return next(matches, None)
class DiffContextChunk(DiffChunk):
    """A chunk flagged as context (``context`` is True)."""

    context = True
class DiffChangedChunk(DiffChunk):
    """A chunk flagged as changed (``context`` is False)."""

    context = False
class DiffFile(object):
    """Accumulates the diff of one file as a list of chunks.

    Lines are added through addContextLine()/addDiffLines(); consecutive
    lines of the same kind are gathered in ``current_chunk`` and moved to
    ``chunks`` by finalize().
    """

    log = logging.getLogger('gertty.gitrepo')

    def __init__(self):
        self.newname = 'Unknown File'
        self.oldname = 'Unknown File'
        self.old_empty = False
        self.new_empty = False
        self.chunks = []
        self.current_chunk = None
        # Line numbers that the next old/new line will receive.
        self.old_lineno = 0
        self.new_lineno = 0
        # Running (new lines - old lines) balance of the changed chunk being
        # built; used to pad the shorter side with blank entries.
        self.offset = 0

    def finalize(self):
        """Close the chunk in progress, if any, and append it to chunks.

        Tabs in every line are expanded, old and new entries are paired
        up, and the first/last flags and the line range are updated.
        """
        chunk = self.current_chunk
        if not chunk:
            return
        oldlines = [(n, d, self.expand_tabs(l)) for (n, d, l)
                    in chunk.oldlines]
        newlines = [(n, d, self.expand_tabs(l)) for (n, d, l)
                    in chunk.newlines]
        chunk.lines = list(zip(oldlines, newlines))
        if not self.chunks:
            chunk.first = True
        else:
            # The previously added chunk is no longer the final one.
            self.chunks[-1].last = False
        chunk.last = True
        chunk.calcRange()
        self.chunks.append(chunk)
        self.current_chunk = None

    def expand_tabs(self, l, tabstop=8):
        """Replace each tab with "»" padded with spaces to the next tab stop.

        :param l: A string, a ``(style, text)`` tuple, or a (possibly
            nested) list of such tuples describing the segments of a
            single line.
        :param tabstop: Distance between tab stops.
        :returns: A value of the same shape with tabs expanded.  If
            anything goes wrong the error is logged and ``l`` is returned
            unchanged.

        The column is carried across all segments of a list so that a tab
        in a later segment is aligned relative to the start of the line
        rather than to the start of its own segment.
        """
        try:
            expanded, _ = self._expand_from(l, 0, tabstop)
            return expanded
        except Exception:
            self.log.exception("Error expanding tabs")
        return l

    def _expand_from(self, item, column, tabstop):
        """Expand ``item`` starting at ``column``; return (item, column)."""
        if isinstance(item, list):
            expanded = []
            for element in item:
                element, column = self._expand_from(element, column, tabstop)
                expanded.append(element)
            return expanded, column
        if isinstance(item, tuple):
            (style, text) = item
            text, column = self._expand_text(text, column, tabstop)
            return (style, text), column
        return self._expand_text(item, column, tabstop)

    def _expand_text(self, text, column, tabstop):
        """Expand tabs in one string; return (text, column after it)."""
        pieces = []
        for index, part in enumerate(text.split('\t')):
            if index:
                # Every split point is one tab; the marker occupies one
                # column and the padding runs up to the next tab stop.
                pad = tabstop - column % tabstop - 1
                pieces.append("»" + " " * pad)
                column += pad + 1
            pieces.append(part)
            column += len(part)
        return ''.join(pieces), column

    def _ensureChunk(self, chunk_class):
        """Make ``current_chunk`` an open chunk of ``chunk_class``.

        A chunk of a different class is finalized first.
        """
        if (self.current_chunk and
            not isinstance(self.current_chunk, chunk_class)):
            self.finalize()
        if not self.current_chunk:
            self.current_chunk = chunk_class()

    def addDiffLines(self, old, new):
        """Add removed (``old``) and added (``new``) lines to a changed chunk.

        The shorter side is padded with ``(None, '', '')`` entries so both
        sides of the chunk end up with the same number of entries.
        """
        self._ensureChunk(DiffChangedChunk)
        chunk = self.current_chunk
        for l in old:
            chunk.oldlines.append((self.old_lineno, '-', l))
            self.old_lineno += 1
            self.offset -= 1
        for l in new:
            chunk.newlines.append((self.new_lineno, '+', l))
            self.new_lineno += 1
            self.offset += 1
        while self.offset > 0:
            chunk.oldlines.append((None, '', ''))
            self.offset -= 1
        while self.offset < 0:
            chunk.newlines.append((None, '', ''))
            self.offset += 1

    def addNewLine(self, line):
        """Ensure a changed chunk is open.

        NOTE(review): ``line`` itself is not recorded anywhere; this looks
        like an unfinished stub -- confirm whether anything calls it.
        """
        self._ensureChunk(DiffChangedChunk)

    def addContextLine(self, line):
        """Add one unchanged line to both sides of a context chunk."""
        self._ensureChunk(DiffContextChunk)
        self.current_chunk.oldlines.append((self.old_lineno, ' ', line))
        self.current_chunk.newlines.append((self.new_lineno, ' ', line))
        self.old_lineno += 1
        self.new_lineno += 1
class GitCheckoutError(Exception):
    """Raised when a git checkout or cherry-pick command fails."""

    def __init__(self, msg):
        # Keep the text reachable both as ``.msg`` and through ``args``.
        self.msg = msg
        Exception.__init__(self, msg)
class GitCloneError(Exception):
    """Raised when a repository cannot be cloned."""

    def __init__(self, msg):
        # Keep the text reachable both as ``.msg`` and through ``args``.
        self.msg = msg
        Exception.__init__(self, msg)
class Repo(object):
    """A local git repository, cloned on demand, plus diff helpers."""

    def __init__(self, url, path):
        """Open the repository at ``path``, cloning from ``url`` if absent.

        :raises GitCloneError: if ``path`` does not exist and ``url`` is
            None.
        """
        self.log = logging.getLogger('gertty.gitrepo')
        self.url = url
        self.path = path
        self.differ = difflib.Differ()
        if not os.path.exists(path):
            if url is None:
                raise GitCloneError("No URL available for git clone")
            git.Repo.clone_from(self.url, self.path)

    def checkCommits(self, shas):
        """Return the subset of ``shas`` that cannot be looked up."""
        invalid = set()
        repo = git.Repo(self.path)
        for sha in shas:
            try:
                repo.commit(sha)
            except (gitdb.exc.BadObject, ValueError):
                invalid.add(sha)
        return invalid

    def fetch(self, url, refspec):
        """Fetch ``refspec`` from ``url`` into the local repository."""
        repo = git.Repo(self.path)
        try:
            repo.git.fetch(url, refspec)
        except AssertionError:
            # NOTE(review): a single retry on AssertionError; presumably
            # this works around a transient assertion raised by the git
            # library -- confirm it is still needed.
            repo.git.fetch(url, refspec)

    def deleteRef(self, ref):
        """Delete the reference ``ref`` from the local repository."""
        repo = git.Repo(self.path)
        git.Reference.delete(repo, ref)

    def checkout(self, ref):
        """Check out ``ref``.

        :raises GitCheckoutError: with git's stderr output (tabs replaced
            by spaces) if the command fails.
        """
        repo = git.Repo(self.path)
        try:
            repo.git.checkout(ref)
        except git.exc.GitCommandError as e:
            raise GitCheckoutError(e.stderr.replace('\t', ' '))

    def cherryPick(self, ref):
        """Cherry-pick ``ref`` onto the current checkout.

        :raises GitCheckoutError: with git's stderr output (tabs replaced
            by spaces) if the command fails.
        """
        repo = git.Repo(self.path)
        try:
            repo.git.cherry_pick(ref)
        except git.exc.GitCommandError as e:
            raise GitCheckoutError(e.stderr.replace('\t', ' '))

    def diffstat(self, old, new):
        """Return ``git diff --numstat`` output between ``old`` and ``new``.

        :returns: One list per output line, split on tabs (added, removed,
            filename).
        """
        repo = git.Repo(self.path)
        diff = repo.git.diff('-M', '--color=never', '--numstat', old, new)
        return [x.split('\t') for x in diff.split('\n')]

    # Raw string: '\s' is not a valid escape in an ordinary string literal.
    trailing_ws_re = re.compile(r'\s+$')

    def _emph_trail_ws(self, style, line):
        """Mark trailing whitespace of ``line`` with the 'trailing-ws' style.

        :returns: ``(style, line)`` when there is no trailing whitespace,
            ``('trailing-ws', line)`` when the line is whitespace only,
            otherwise a two-element list of ``(style, text)`` and
            ``('trailing-ws', whitespace)``.
        """
        match = self.trailing_ws_re.search(line)
        if not match:
            return (style, line)
        start, end = match.span()
        if start == 0:
            return ('trailing-ws', line)
        return [(style, line[:start]),
                ('trailing-ws', line[start:end])]

    def _emitPlainLine(self, style, line, output_old, output_new):
        """Append a line that has no intraline markup to the right side(s).

        Added lines go to the new side, removed lines to the old side and
        context lines to both.
        """
        if style == 'added' or style == 'context':
            output_new.append(self._emph_trail_ws(style+'-line', line))
        if style == 'removed' or style == 'context':
            output_old.append((style+'-line', line))

    def intralineDiff(self, old, new):
        """Compute word-level emphasis for a block of changed lines.

        :param old: List of removed lines.
        :param new: List of added lines.
        :returns: ``(output_old, output_new)``; each entry is either a
            ``(style, text)`` tuple or a list of such segments.
        """
        # A line can only be emitted once we know whether the differ
        # follows it with a '?' hint line, so it is held in prevline.
        prevline = None
        prevstyle = None
        output_old = []
        output_new = []
        for line in self.differ.compare(old, new):
            key = line[0]
            rest = line[2:]
            if key == '?':
                # Hint line: every non-space column marks a changed
                # character of the previous line.
                result = []
                accumulator = ''
                emphasis = False
                rest = rest[:-1]  # It has a newline.
                for i, c in enumerate(prevline):
                    if i >= len(rest):
                        indicator = ' '
                    else:
                        indicator = rest[i]
                    if indicator != ' ' and not emphasis:
                        # changing from not emph to emph
                        if accumulator:
                            result.append((prevstyle+'-line', accumulator))
                        accumulator = ''
                        emphasis = True
                    elif indicator == ' ' and emphasis:
                        # changing from emph to not emph
                        if accumulator:
                            result.append((prevstyle+'-word', accumulator))
                        accumulator = ''
                        emphasis = False
                    accumulator += c
                if accumulator:
                    if emphasis:
                        result.append(self._emph_trail_ws(prevstyle+'-word',
                                                          accumulator))
                    else:
                        result.append(self._emph_trail_ws(prevstyle+'-line',
                                                          accumulator))
                if prevstyle == 'added':
                    output_new.append(result)
                elif prevstyle == 'removed':
                    output_old.append(result)
                prevline = None
                continue
            if prevline is not None:
                self._emitPlainLine(prevstyle, prevline,
                                    output_old, output_new)
            if key == '+':
                prevstyle = 'added'
            elif key == '-':
                prevstyle = 'removed'
            elif key == ' ':
                prevstyle = 'context'
            prevline = rest
        if prevline is not None:
            # Flush the final pending line exactly like any other one, so
            # a trailing context line is not lost from both sides.
            self._emitPlainLine(prevstyle, prevline, output_old, output_new)
        return output_old, output_new

    # Raw string: '\d' and '\+' are not valid escapes in an ordinary string.
    header_re = re.compile(r'@@ -(\d+)(,\d+)? \+(\d+)(,\d+)? @@')

    def _addBlobContext(self, f, blob):
        """Add every line of ``blob`` to ``f`` as a context line."""
        for line in blob.data_stream.read().splitlines():
            if isinstance(line, six.string_types):
                f.addContextLine(line)
            else:
                f.addContextLine(line.decode('utf8'))

    def diff(self, old, new, context=10000, show_old_commit=False):
        """Create a diff from old to new.

        Note that the commit message is also diffed, and listed as
        /COMMIT_MSG.

        :param old: Revision of the old commit.
        :param new: Revision of the new commit.
        :param context: Number of context lines requested from git.
        :param show_old_commit: If true, diff the old commit message
            against the new one; otherwise show the new message as an
            added file.
        :returns: A list of DiffFile objects, commit message first.
        :raises Exception: on a diff line that is not understood.
        """
        repo = git.Repo(self.path)
        oldc = repo.commit(old)
        newc = repo.commit(new)
        files = []
        extra_contexts = []
        if show_old_commit:
            extra_contexts.append(CommitContext(oldc, newc))
        else:
            extra_contexts.append(CommitContext(None, newc))
        contexts = itertools.chain(
            extra_contexts, oldc.diff(
                newc, color='never', create_patch=True, unified=context))
        for diff_context in contexts:
            # Each iteration of this is a file
            f = DiffFile()
            f.oldname = diff_context.a_path
            f.newname = diff_context.b_path
            if diff_context.new_file:
                f.oldname = 'Empty file'
                f.old_empty = True
            if diff_context.deleted_file:
                f.newname = 'Empty file'
                f.new_empty = True
            files.append(f)
            if diff_context.rename_from:
                f.oldname = diff_context.rename_from
            if diff_context.rename_to:
                f.newname = diff_context.rename_to
            oldchunk = []
            newchunk = []
            prev_key = ''
            if isinstance(diff_context.diff, six.string_types):
                diff_text = diff_context.diff
            else:
                diff_text = diff_context.diff.decode('utf-8')
            diff_lines = diff_text.split('\n')
            for i, line in enumerate(diff_lines):
                last_line = (i == len(diff_lines)-1)
                if line.startswith('---'):
                    continue
                if line.startswith('+++'):
                    continue
                if line.startswith('@@'):
                    # Hunk header: restart numbering at the given lines.
                    m = self.header_re.match(line)
                    f.old_lineno = int(m.group(1))
                    f.new_lineno = int(m.group(3))
                    continue
                if not line:
                    if prev_key != '\\':
                        # Strangely, we get an extra newline in the
                        # diff in the case that the last line is "\ No
                        # newline at end of file". This is a
                        # workaround for that.
                        prev_key = ''
                        line = 'X '
                    else:
                        line = ' '
                key = line[0]
                rest = line[1:]
                if key == '\\':
                    # This is for "\ No newline at end of file" which
                    # follows either a -, + or ' ' line to indicate
                    # which file it's talking about (or both). For
                    # now, treat it like normal text and let the user
                    # infer from context that it's not actually in the
                    # file. Potential TODO: highlight it to make that
                    # more clear.
                    if prev_key:
                        key = prev_key
                    else:
                        key = ' '
                    prev_key = '\\'
                if key == '-':
                    prev_key = '-'
                    oldchunk.append(rest)
                    if not last_line:
                        continue
                if key == '+':
                    prev_key = '+'
                    newchunk.append(rest)
                    if not last_line:
                        continue
                prev_key = ''
                # end of chunk
                if oldchunk or newchunk:
                    oldchunk, newchunk = self.intralineDiff(oldchunk, newchunk)
                    f.addDiffLines(oldchunk, newchunk)
                oldchunk = []
                newchunk = []
                if key == ' ':
                    f.addContextLine(rest)
                    continue
                # Extended header lines carry nothing to display.
                if line.startswith(("similarity index", "rename", "index",
                                    "Binary files")):
                    continue
                if not last_line:
                    raise Exception("Unhandled line: %s" % line)
            if not diff_context.diff:
                # There is no diff, possibly because this is simply a
                # rename. Include context lines so that comments may
                # appear.
                if not f.new_empty:
                    blob = newc.tree[f.newname]
                else:
                    blob = oldc.tree[f.oldname]
                f.old_lineno = 1
                f.new_lineno = 1
                self._addBlobContext(f, blob)
            f.finalize()
        return files

    def getFile(self, old, new, path):
        """Return ``path`` as it exists in ``new`` as an all-context DiffFile.

        :param old: Unused.
        :param new: Revision of the commit to read the file from.
        :param path: Path of the file inside the commit's tree.
        :returns: A DiffFile, or None if ``path`` is not in the tree.
        """
        f = DiffFile()
        f.oldname = path
        f.newname = path
        f.old_lineno = 1
        f.new_lineno = 1
        repo = git.Repo(self.path)
        newc = repo.commit(new)
        try:
            blob = newc.tree[path]
        except KeyError:
            return None
        self._addBlobContext(f, blob)
        f.finalize()
        return f
def get_repo(project_name, config):
    """Return the Repo for ``project_name`` located under ``config.git_root``.

    :param project_name: Project name; appended to both ``config.git_root``
        (local path) and ``config.git_url`` (clone URL).
    :param config: Object providing ``git_root`` and ``git_url``.
    :returns: A Repo for the project.
    :raises AssertionError: if the project path resolves to a location
        outside of ``config.git_root``.
    """
    local_path = os.path.join(config.git_root, project_name)
    local_root = os.path.abspath(config.git_root)
    # Compare normalised paths on a path-component boundary.  A plain
    # character-wise prefix test on the unnormalised path would accept
    # names such as "../elsewhere".
    resolved = os.path.abspath(local_path)
    root_prefix = os.path.join(local_root, '')
    if not (resolved + os.sep).startswith(root_prefix):
        # Raised explicitly (rather than via ``assert``) so the check
        # still runs under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            "Project path %s is outside of git root %s"
            % (local_path, local_root))
    return Repo(config.git_url + project_name, local_path)