xref: /aosp_15_r20/external/toolchain-utils/llvm_tools/revert_checker.py (revision 760c253c1ed00ce9abd48f8546f08516e57485fe)
1*760c253cSXin Li#!/usr/bin/env python3
2*760c253cSXin Li# -*- coding: utf-8 -*-
3*760c253cSXin Li# ===----------------------------------------------------------------------===##
4*760c253cSXin Li#
5*760c253cSXin Li# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6*760c253cSXin Li# See https://llvm.org/LICENSE.txt for license information.
7*760c253cSXin Li# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8*760c253cSXin Li#
9*760c253cSXin Li# ===----------------------------------------------------------------------===##
10*760c253cSXin Li#
11*760c253cSXin Li# !!!!!!!!!!!! NOTE !!!!!!!!!!!!
12*760c253cSXin Li# This is copied directly from upstream LLVM. Please make any changes upstream,
13*760c253cSXin Li# rather than to this file directly. Once changes are made there, you're free
14*760c253cSXin Li# to integrate them here.
15*760c253cSXin Li
16*760c253cSXin Li"""Checks for reverts of commits across a given git commit.
17*760c253cSXin Li
18*760c253cSXin LiTo clarify the meaning of 'across' with an example, if we had the following
19*760c253cSXin Licommit history (where `a -> b` notes that `b` is a direct child of `a`):
20*760c253cSXin Li
21*760c253cSXin Li123abc -> 223abc -> 323abc -> 423abc -> 523abc
22*760c253cSXin Li
23*760c253cSXin LiAnd where 423abc is a revert of 223abc, this revert is considered to be 'across'
24*760c253cSXin Li323abc. More generally, a revert A of a parent commit B is considered to be
25*760c253cSXin Li'across' a commit C if C is a parent of A and B is a parent of C.
26*760c253cSXin Li
27*760c253cSXin LiPlease note that revert detection in general is really difficult, since merge
28*760c253cSXin Liconflicts/etc always introduce _some_ amount of fuzziness. This script just
29*760c253cSXin Liuses a bundle of heuristics, and is bound to ignore / incorrectly flag some
30*760c253cSXin Lireverts. The hope is that it'll easily catch the vast majority (>90%) of them,
31*760c253cSXin Lithough.
32*760c253cSXin Li
33*760c253cSXin LiThis is designed to be used in one of two ways: an import in Python, or run
34*760c253cSXin Lidirectly from a shell. If you want to import this, the `find_reverts`
35*760c253cSXin Lifunction is the thing to look at. If you'd rather use this from a shell, have a
36*760c253cSXin Liusage example:
37*760c253cSXin Li
38*760c253cSXin Li```
39*760c253cSXin Li./revert_checker.py c47f97169 origin/main origin/release/12.x
40*760c253cSXin Li```
41*760c253cSXin Li
42*760c253cSXin LiThis checks for all reverts from the tip of origin/main to c47f97169, which are
43*760c253cSXin Liacross the latter. It then does the same for origin/release/12.x to c47f97169.
44*760c253cSXin LiDuplicate reverts discovered when walking both roots (origin/main and
45*760c253cSXin Liorigin/release/12.x) are deduplicated in output.
46*760c253cSXin Li"""
47*760c253cSXin Li
48*760c253cSXin Liimport argparse
49*760c253cSXin Liimport collections
50*760c253cSXin Liimport logging
51*760c253cSXin Liimport re
52*760c253cSXin Liimport subprocess
53*760c253cSXin Liimport sys
54*760c253cSXin Lifrom typing import Generator, Iterable, List, NamedTuple
55*760c253cSXin Li
56*760c253cSXin Li
57*760c253cSXin Liassert sys.version_info >= (3, 6), "Only Python 3.6+ is supported."
58*760c253cSXin Li
59*760c253cSXin Li# People are creative with their reverts, and heuristics are a bit difficult.
# Like 90% of reverts have "This reverts commit ${full_sha}".
61*760c253cSXin Li# Some lack that entirely, while others have many of them specified in ad-hoc
62*760c253cSXin Li# ways, while others use short SHAs and whatever.
63*760c253cSXin Li#
64*760c253cSXin Li# The 90% case is trivial to handle (and 100% free + automatic). The extra 10%
65*760c253cSXin Li# starts involving human intervention, which is probably not worth it for now.
66*760c253cSXin Li
67*760c253cSXin Li
def _try_parse_reverts_from_commit_message(commit_message: str) -> List[str]:
    """Extracts the SHAs that `commit_message` claims to revert.

    Two forms are recognized:
      - `This reverts commit <40-hex-digit sha>` anywhere in the message.
      - A subject line starting with `Revert <6+ hex digits> "`.

    Returns:
        The full SHAs in order of appearance, followed by the (possibly
        abbreviated) SHA from the subject line, if any. Duplicates are not
        removed. An empty message yields an empty list.
    """
    if not commit_message:
        return []

    full_sha_pattern = r"This reverts commit ([a-f0-9]{40})\b"
    found = [
        match.group(1)
        for match in re.finditer(full_sha_pattern, commit_message)
    ]

    # Only the subject (first) line is checked for the `Revert <sha> "..."`
    # form.
    subject, *_ = commit_message.splitlines()
    subject_match = re.match(r'Revert ([a-f0-9]{6,}) "', subject)
    if subject_match is not None:
        found.append(subject_match.group(1))
    return found
81*760c253cSXin Li
82*760c253cSXin Li
def _stream_stdout(command: List[str]) -> Generator[str, None, None]:
    """Runs `command` and lazily yields its stdout, one line at a time.

    Output is decoded as UTF-8, with undecodable bytes replaced. Lines are
    yielded with their trailing newline intact. The process's exit status is
    not checked.
    """
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
    )
    # The context manager closes the pipe and waits for the process on exit.
    with process:
        output = process.stdout
        assert output is not None  # for mypy's happiness.
        for line in output:
            yield line
89*760c253cSXin Li
90*760c253cSXin Li
def _resolve_sha(git_dir: str, sha: str) -> str:
    """Expands a possibly-abbreviated `sha` using `git rev-parse`.

    A 40-character input is assumed to already be a full SHA and is returned
    untouched, without invoking git.

    Raises:
        subprocess.CalledProcessError: if `git rev-parse` exits non-zero.
    """
    if len(sha) != 40:
        rev_parse_output = subprocess.check_output(
            ["git", "-C", git_dir, "rev-parse", sha],
            encoding="utf-8",
            stderr=subprocess.DEVNULL,
        )
        return rev_parse_output.strip()

    return sha
100*760c253cSXin Li
101*760c253cSXin Li
# A single commit parsed out of `git log` output.
class _LogEntry(NamedTuple):
    # The commit's SHA, as printed by git's `%H` format.
    sha: str
    # The commit's message body, with trailing whitespace removed.
    commit_message: str
109*760c253cSXin Li
110*760c253cSXin Li
def _log_stream(
    git_dir: str, root_sha: str, end_at_sha: str
) -> Iterable[_LogEntry]:
    """Yields a `_LogEntry` for each commit `git log ^end_at_sha root_sha` shows.

    Entries are yielded in the order git prints them. Each commit is printed
    as a separator line, then the SHA, then the message body; the stream is
    parsed line by line on that basis.
    """
    sep = 50 * "<>"
    lines = iter(
        _stream_stdout(
            [
                "git",
                "-C",
                git_dir,
                "log",
                "^" + end_at_sha,
                root_sha,
                "--format=" + sep + "%n%H%n%B%n",
            ]
        )
    )

    # Consume lines up to and including the first separator. If there's
    # nothing to log, no separator exists. It might not be the first line if
    # git feels complainy. `any` stops consuming at the first match.
    at_commit = any(raw.rstrip() == sep for raw in lines)

    while at_commit:
        # The line right after a separator is always the commit's SHA.
        sha_line = next(lines, None)
        assert sha_line is not None, "git died?"

        # Everything up to the next separator (or EOF) is the message body.
        body = []
        at_commit = False
        for raw in lines:
            stripped = raw.rstrip()
            if stripped == sep:
                at_commit = True
                break
            body.append(stripped)

        yield _LogEntry(sha_line.rstrip(), "\n".join(body).rstrip())
151*760c253cSXin Li
152*760c253cSXin Li
def _shas_between(git_dir: str, base_ref: str, head_ref: str) -> Iterable[str]:
    """Lazily yields the SHAs `git rev-list --first-parent base..head` prints.

    Each SHA is stripped of surrounding whitespace.
    """
    commit_range = f"{base_ref}..{head_ref}"
    command = [
        "git",
        "-C",
        git_dir,
        "rev-list",
        "--first-parent",
        commit_range,
    ]
    for raw_line in _stream_stdout(command):
        yield raw_line.strip()
163*760c253cSXin Li
164*760c253cSXin Li
def _rev_parse(git_dir: str, ref: str) -> str:
    """Returns the stripped output of `git rev-parse ref`, run in `git_dir`.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    command = ["git", "-C", git_dir, "rev-parse", ref]
    output = subprocess.check_output(command, encoding="utf-8")
    return output.strip()
170*760c253cSXin Li
171*760c253cSXin Li
# A commit (`sha`) together with a commit it claims to revert
# (`reverted_sha`).
class Revert(NamedTuple):
    # SHA of the commit doing the reverting.
    sha: str
    # SHA of the commit being reverted.
    reverted_sha: str
179*760c253cSXin Li
180*760c253cSXin Li
def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
    """Finds the closest common parent commit between `ref_a` and `ref_b`.

    This is the stripped output of `git merge-base ref_a ref_b`.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero.
    """
    command = ["git", "-C", git_dir, "merge-base", ref_a, ref_b]
    merge_base = subprocess.check_output(command, encoding="utf-8")
    return merge_base.strip()
187*760c253cSXin Li
188*760c253cSXin Li
def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
    """Finds reverts across `across_ref` in `git_dir`, starting from `root`.

    Every commit shown by `git log ^across_ref root` is inspected. A commit
    is reported when its message claims to revert a commit object that is
    not among the first-parent commits between `across_ref` and `root`.

    Args:
        git_dir: Directory of the git checkout to run git commands in.
        across_ref: Ref or SHA that reverts must cross; must be an ancestor
            of `root`.
        root: Ref or SHA to start walking history from.

    Returns:
        The reverts found, in order of oldest reverts first.

    Raises:
        ValueError: if `across_ref` isn't an ancestor of `root`.
        subprocess.CalledProcessError: if resolving `across_ref` or `root`,
            or computing their merge base, fails.
    """
    across_sha = _rev_parse(git_dir, across_ref)
    root_sha = _rev_parse(git_dir, root)

    common_ancestor = _find_common_parent_commit(git_dir, across_sha, root_sha)
    if common_ancestor != across_sha:
        # Both halves need the `f` prefix; adjacent literals don't share it.
        raise ValueError(
            f"{across_sha} isn't an ancestor of {root_sha} "
            f"(common ancestor: {common_ancestor})"
        )

    intermediate_commits = set(_shas_between(git_dir, across_sha, root_sha))
    assert across_sha not in intermediate_commits

    logging.debug(
        "%d commits appear between %s and %s",
        len(intermediate_commits),
        across_sha,
        root_sha,
    )

    all_reverts = []
    for sha, commit_message in _log_stream(git_dir, root_sha, across_sha):
        reverts = _try_parse_reverts_from_commit_message(commit_message)
        if not reverts:
            continue

        # Abbreviated SHAs come from free-form commit subjects, so they may
        # be unknown or ambiguous in this repository. Treat that the same way
        # as an unresolvable full SHA below: warn and move on, rather than
        # aborting the whole scan.
        resolved_reverts = set()
        for claimed_sha in reverts:
            try:
                resolved_reverts.add(_resolve_sha(git_dir, claimed_sha))
            except subprocess.CalledProcessError:
                logging.warning(
                    "Failed to resolve reverted object %s (claimed to be "
                    "reverted by sha %s)",
                    claimed_sha,
                    sha,
                )

        # Sorted so that output is deterministic when one commit reverts
        # several others.
        for reverted_sha in sorted(resolved_reverts):
            if reverted_sha in intermediate_commits:
                # Both the commit and its revert landed after `across_sha`,
                # so this revert doesn't cross it.
                logging.debug(
                    "Commit %s reverts %s, which happened after %s",
                    sha,
                    reverted_sha,
                    across_sha,
                )
                continue

            try:
                object_type = subprocess.check_output(
                    ["git", "-C", git_dir, "cat-file", "-t", reverted_sha],
                    encoding="utf-8",
                    stderr=subprocess.DEVNULL,
                ).strip()
            except subprocess.CalledProcessError:
                logging.warning(
                    "Failed to resolve reverted object %s (claimed to be reverted "
                    "by sha %s)",
                    reverted_sha,
                    sha,
                )
                continue

            if object_type == "commit":
                all_reverts.append(Revert(sha, reverted_sha))
                continue

            logging.error(
                "%s claims to revert %s -- which isn't a commit -- %s",
                sha,
                object_type,
                reverted_sha,
            )

    # Since `all_reverts` contains reverts in log order (e.g., newer comes before
    # older), we need to reverse this to keep with our guarantee of older =
    # earlier in the result.
    all_reverts.reverse()
    return all_reverts
264*760c253cSXin Li
265*760c253cSXin Li
def _main() -> None:
    """Command-line entry point: prints each unique revert found.

    Parses arguments, runs `find_reverts` once per root, and prints one
    `<sha> claims to revert <sha>` line per distinct revert.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "base_ref",
        help="Git ref or sha to check for reverts around.",
    )
    parser.add_argument(
        "-C",
        "--git_dir",
        default=".",
        help="Git directory to use.",
    )
    parser.add_argument(
        "root",
        nargs="+",
        help="Root(s) to search for commits from.",
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument(
        "-u",
        "--review_url",
        action="store_true",
        help="Format SHAs as llvm review URLs",
    )
    opts = parser.parse_args()

    log_level = logging.DEBUG if opts.debug else logging.INFO
    logging.basicConfig(
        format="%(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: %(message)s",
        level=log_level,
    )

    def format_sha(sha: str) -> str:
        # Optionally dress a bare SHA up as a review URL.
        return f"https://reviews.llvm.org/rG{sha}" if opts.review_url else sha

    # `root`s can have related history, so duplicate reverts are dropped.
    # First-seen order is kept so that output for the common single-root case
    # stays in an order that's meaningful to git.
    unique_reverts = []
    already_seen = set()
    for root in opts.root:
        for revert in find_reverts(opts.git_dir, opts.base_ref, root):
            if revert in already_seen:
                continue
            already_seen.add(revert)
            unique_reverts.append(revert)

    for revert in unique_reverts:
        sha_fmt = format_sha(revert.sha)
        reverted_sha_fmt = format_sha(revert.reverted_sha)
        print(f"{sha_fmt} claims to revert {reverted_sha_fmt}")
318*760c253cSXin Li
319*760c253cSXin Li
320*760c253cSXin Liif __name__ == "__main__":
321*760c253cSXin Li    _main()
322