import argparse
import csv
import dataclasses
import os
import pprint
import re
from collections import defaultdict
from pathlib import Path
from typing import List, Optional

import common
from common import (
    features_to_dict,
    frontend_categories,
    get_commit_data_cache,
    run,
    topics,
)


"""
Example Usages

Create a new commitlist for consumption by categorize.py.
Said commitlist contains commits between v1.5.0 and f5bc91f851.

    python commitlist.py --create-new tags/v1.5.0 f5bc91f851

Update the existing commitlist to commit bfcb687b9c.

    python commitlist.py --update-to bfcb687b9c

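Other modes (see the argument parser in main() below for details):

    python commitlist.py --stat
    python commitlist.py --rerun-with-new-filters
    python commitlist.py --export-csv-categories
    python commitlist.py --export-markdown

All of these read the CSV given by --path (default: results/commitlist.csv).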
"""


@dataclasses.dataclass(frozen=False)
class Commit:
    commit_hash: str
    category: str
    topic: str
    title: str
    files_changed: str
    pr_link: str
    author: str

    # These are kept as three separate fields (rather than a list) so they are easier to put in a spreadsheet
    accepter_1: str
    accepter_2: str
    accepter_3: str

    merge_into: Optional[str] = None

    def __repr__(self):
        return (
            f"Commit({self.commit_hash}, {self.category}, {self.topic}, {self.title})"
        )


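# The CSV column order mirrors the field order of the Commit dataclass above.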
commit_fields = tuple(f.name for f in dataclasses.fields(Commit))


class CommitList:
    # NB: Private ctor. Use `from_existing` or `create_new`.
    def __init__(self, path: str, commits: List[Commit]):
        self.path = path
        self.commits = commits

    @staticmethod
    def from_existing(path):
        commits = CommitList.read_from_disk(path)
        return CommitList(path, commits)

    @staticmethod
    def create_new(path, base_version, new_version):
        if os.path.exists(path):
            raise ValueError(
                "Attempted to create a new commitlist but one exists already!"
            )
        commits = CommitList.get_commits_between(base_version, new_version)
        return CommitList(path, commits)

    @staticmethod
    def read_from_disk(path) -> List[Commit]:
        with open(path) as csvfile:
            reader = csv.DictReader(csvfile)
            rows = []
            for row in reader:
                if row.get("new_title", "") != "":
                    row["title"] = row["new_title"]
                filtered_rows = {k: row.get(k, "") for k in commit_fields}
                rows.append(Commit(**filtered_rows))
        return rows

    def write_result(self):
        self.write_to_disk_static(self.path, self.commits)

    @staticmethod
    def write_to_disk_static(path, commit_list):
        os.makedirs(Path(path).parent, exist_ok=True)
        with open(path, "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(commit_fields)
            for commit in commit_list:
                writer.writerow(dataclasses.astuple(commit))

    @staticmethod
    def keywordInFile(file, keywords):
        return any(keyword in file for keyword in keywords)

    @staticmethod
    def gen_commit(commit_hash):
        feature_item = get_commit_data_cache().get(commit_hash)
        features = features_to_dict(feature_item)
        category, topic = CommitList.categorize(features)
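        # Pad with empty strings so commits with fewer than three accepters still unpack cleanly.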
        a1, a2, a3 = (features["accepters"] + ("", "", ""))[:3]
        if features["pr_number"] is not None:
            pr_link = f"https://github.com/pytorch/pytorch/pull/{features['pr_number']}"
        else:
            pr_link = None
        files_changed_str = " ".join(features["files_changed"])
        return Commit(
            commit_hash,
            category,
            topic,
            features["title"],
            files_changed_str,
            pr_link,
            features["author"],
            a1,
            a2,
            a3,
        )

    @staticmethod
    def category_remapper(category: str) -> str:
        if category in frontend_categories:
            category = category + "_frontend"
            return category
        if category == "Meta API":
            category = "composability"
            return category
        if category in common.quantization.categories:
            category = common.quantization.name
            return category
        if category in common.distributed.categories:
            category = common.distributed.name
            return category
        return category

    @staticmethod
    def bracket_category_matcher(title: str):
        """Categorize a commit based on the presence of a bracketed category in the title.

        Args:
            title (str): title to search

        Returns:
            Optional[str]: the matched category, or None if no bracket tag is found.
        """
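        # e.g. a title containing "[Inductor]" or "[torchinductor]" (any casing) maps to "inductor".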
        pairs = [
            ("[dynamo]", "dynamo"),
            ("[torchdynamo]", "dynamo"),
            ("[torchinductor]", "inductor"),
            ("[inductor]", "inductor"),
            ("[codemod", "skip"),
            ("[profiler]", "profiler"),
            ("[functorch]", "functorch"),
            ("[autograd]", "autograd_frontend"),
            ("[quantization]", "quantization"),
            ("[nn]", "nn_frontend"),
            ("[complex]", "complex_frontend"),
            ("[mps]", "mps"),
            ("[optimizer]", "optimizer_frontend"),
            ("[xla]", "xla"),
        ]
        title_lower = title.lower()
        for bracket, category in pairs:
            if bracket in title_lower:
                return category
        return None

    @staticmethod
    def categorize(features):
        title = features["title"]
        labels = features["labels"]
        category = "Uncategorized"
        topic = "Untopiced"

        # Revert commits are merged directly to master with no associated PR number
        if features["pr_number"] is None:
            if title.startswith("Revert"):
                return "skip", topic

        # We ask contributors to label their PRs appropriately
        # when they're first landed.
        # Check if the labels are there first.
        already_categorized = already_topiced = False
        for label in labels:
            if label.startswith("release notes: "):
                category = label.split("release notes: ", 1)[1]
                category = CommitList.category_remapper(category)
                already_categorized = True
            if label.startswith("topic: "):
                topic = label.split("topic: ", 1)[1]
                already_topiced = True
        if already_categorized and already_topiced:
            return category, topic

        # update this to check if each file starts with caffe2
        if "caffe2" in title:
            return "caffe2", topic
        if "Reverted" in labels:
            return "skip", topic
        if "module: deprecation" in labels:
            topic = "deprecation"

        found_bracket_category = CommitList.bracket_category_matcher(title)
        if found_bracket_category:
            return found_bracket_category, topic

        files_changed = features["files_changed"]
        for file in files_changed:
            file_lowercase = file.lower()
            if CommitList.keywordInFile(
                file,
                [
                    "docker/",
                    ".circleci",
                    ".github",
                    ".jenkins",
                    ".ci",
                    ".azure_pipelines",
                ],
            ):
                category = "releng"
                break
            # datapipe(s), torch/utils/data, test_{dataloader, datapipe}
            if CommitList.keywordInFile(
                file, ["torch/utils/data", "test_dataloader", "test_datapipe"]
            ):
                category = "dataloader_frontend"
                break
            if CommitList.keywordInFile(file, ["torch/csrc/api", "test/cpp/api"]):
                category = "cpp_frontend"
                break
            if CommitList.keywordInFile(file, ["distributed", "c10d"]):
                category = "distributed"
                break
            if "vulkan" in file_lowercase:
                category = "vulkan"
                break
            if "foreach" in file_lowercase:
                category = "foreach_frontend"
                break
            if "onnx" in file_lowercase:
                category = "onnx"
                break
            if CommitList.keywordInFile(file, ["torch/fx", "test_fx"]):
                category = "fx"
                break
            if CommitList.keywordInFile(file, ["torch/ao", "test/ao"]):
                category = common.quantization.name
                break
            # torch/quantization, test/quantization, aten/src/ATen/native/quantized, torch/nn/{quantized, quantizable}
            if CommitList.keywordInFile(
                file,
                [
                    "torch/quantization",
                    "test/quantization",
                    "aten/src/ATen/native/quantized",
                    "torch/nn/quantiz",
                ],
            ):
                category = common.quantization.name
                break
            if CommitList.keywordInFile(file, ["torch/package", "test/package"]):
                category = "package"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "torch/csrc/jit/mobile",
                    "aten/src/ATen/native/metal",
                    "test/mobile",
                    "torch/backends/_nnapi/",
                    "test/test_nnapi.py",
                ],
            ):
                category = "mobile"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "aten/src/ATen/native/LinearAlgebra.cpp",
                    "test/test_linalg.py",
                    "torch/linalg",
                ],
            ):
                category = "linalg_frontend"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "torch/sparse",
                    "aten/src/ATen/native/sparse",
                    "torch/_masked/__init__.py",
                ],
            ):
                category = "sparse_frontend"
                break
            if CommitList.keywordInFile(file, ["tools/autograd"]):
                category = "autograd_frontend"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "test/test_nn.py",
                    "test/test_module.py",
                    "torch/nn/modules",
                    "torch/nn/functional.py",
                ],
            ):
                category = "nn_frontend"
                break
            if CommitList.keywordInFile(file, ["torch/csrc/jit", "torch/jit"]):
                category = "jit"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "torch/_meta_registrations.py",
                    "torch/_decomp",
                    "torch/_prims",
                    "torch/_refs",
                ],
            ):
                category = "composability"
                break
            if CommitList.keywordInFile(file, ["torch/_dynamo"]):
                category = "dynamo"
                break
            if CommitList.keywordInFile(file, ["torch/_inductor"]):
                category = "inductor"
                break
        else:
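            # NB: this `else` belongs to the `for` loop above; it runs only if the loop
            # completed without a `break`, i.e. none of the file-path rules matched.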
            # Below are some extra quick checks that aren't necessarily file-path related,
            # but I found that they catch a decent number of extra commits.
            if len(files_changed) > 0 and all(
                f_name.endswith((".cu", ".cuh")) for f_name in files_changed
            ):
                category = "cuda"
            elif "[PyTorch Edge]" in title:
                category = "mobile"
            elif (
                len(files_changed) == 1
                and "torch/testing/_internal/common_methods_invocations.py"
                in files_changed[0]
            ):
                # when this is the only file changed, it's almost always an OpInfo change.
                category = "python_frontend"
            elif len(files_changed) == 1 and "torch/_torch_docs.py" in files_changed[0]:
                # individual torch_docs changes are usually for python ops
                category = "python_frontend"

        # If we couldn't find a category but the topic is not user facing, we can skip the commit:
        if category == "Uncategorized" and topic == "not user facing":
            category = "skip"

        return category, topic

    @staticmethod
    def get_commits_between(base_version, new_version):
        cmd = f"git merge-base {base_version} {new_version}"
        rc, merge_base, _ = run(cmd)
        assert rc == 0

        # Returns a list of something like
        # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
        cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
        rc, commits, _ = run(cmd)
        assert rc == 0

        log_lines = commits.split("\n")
        hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
        return [CommitList.gen_commit(commit_hash) for commit_hash in hashes]

    def filter(self, *, category=None, topic=None):
        commits = self.commits
        if category is not None:
            commits = [commit for commit in commits if commit.category == category]
        if topic is not None:
            commits = [commit for commit in commits if commit.topic == topic]
        return commits

    def update_to(self, new_version):
        last_hash = self.commits[-1].commit_hash
        new_commits = CommitList.get_commits_between(last_hash, new_version)
        self.commits += new_commits

    def stat(self):
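        # Nested counts: category -> topic -> number of commits.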
        counts = defaultdict(lambda: defaultdict(int))
        for commit in self.commits:
            counts[commit.category][commit.topic] += 1
        return counts


def create_new(path, base_version, new_version):
    commits = CommitList.create_new(path, base_version, new_version)
    commits.write_result()


def update_existing(path, new_version):
    commits = CommitList.from_existing(path)
    commits.update_to(new_version)
    commits.write_result()


def rerun_with_new_filters(path):
    current_commits = CommitList.from_existing(path)
    for i, commit in enumerate(current_commits.commits):
        current_category = commit.category
        if (
            current_category == "Uncategorized"
            or current_category not in common.categories
        ):
            feature_item = get_commit_data_cache().get(commit.commit_hash)
            features = features_to_dict(feature_item)
            category, topic = CommitList.categorize(features)
            current_commits.commits[i] = dataclasses.replace(
                commit, category=category, topic=topic
            )
    current_commits.write_result()


def get_hash_or_pr_url(commit: Commit):
    # cdc = get_commit_data_cache()
    pr_link = commit.pr_link
    if pr_link is None:
        return commit.commit_hash
    else:
        regex = r"https://github.com/pytorch/pytorch/pull/([0-9]+)"
        matches = re.findall(regex, pr_link)
        if len(matches) == 0:
            return commit.commit_hash

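        # e.g. https://github.com/pytorch/pytorch/pull/34555 renders as "[#34555](https://github.com/pytorch/pytorch/pull/34555)"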
        return f"[#{matches[0]}]({pr_link})"


def to_markdown(commit_list: CommitList, category):
    def cleanup_title(commit):
        match = re.match(r"(.*) \(#\d+\)", commit.title)
        if match is None:
            return commit.title
        return match.group(1)

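    # Commits whose merge_into field points at another commit are grouped under that
    # target commit so that all related PRs end up on a single bullet.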
    merge_mapping = defaultdict(list)
    for commit in commit_list.commits:
        if commit.merge_into:
            merge_mapping[commit.merge_into].append(commit)

    cdc = get_commit_data_cache()
    lines = [f"\n## {category}\n"]
    for topic in topics:
        lines.append(f"### {topic}\n")
        commits = commit_list.filter(category=category, topic=topic)
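        # Topic strings are inconsistent about underscores vs. spaces, so also pick up
        # commits filed under the alternate spelling.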
        if "_" in topic:
            commits.extend(
                commit_list.filter(category=category, topic=topic.replace("_", " "))
            )
        if " " in topic:
            commits.extend(
                commit_list.filter(category=category, topic=topic.replace(" ", "_"))
            )
        for commit in commits:
            if commit.merge_into:
                continue
            all_related_commits = merge_mapping[commit.commit_hash] + [commit]
            commit_list_md = ", ".join(
                get_hash_or_pr_url(c) for c in all_related_commits
            )
            result = f"- {cleanup_title(commit)} ({commit_list_md})\n"
            lines.append(result)
    return lines


def get_markdown_header(category):
    header = f"""
# Release Notes worksheet {category}

The main goal of this process is to rephrase all the commit messages below to make them clear and easy to read by the end user. Please follow these instructions to do so:

* **Please clean up and format commit titles to be readable by the general PyTorch user.** [Detailed instructions here](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit)
* Please sort commits into the following categories (you should not rename the categories!). I tried to pre-sort these to ease your work; feel free to move commits around if the current categorization is not good.
* Please drop any commits that are not user-facing.
* If anything is from another domain, leave it in the UNTOPICED section at the end and I'll come and take care of it.
* Please use markdown format.
* Please use #PR_NUM to link to the PR, instead of `[#PR_NUM](https://github.com/pytorch/pytorch/pull/#PR_NUM)`, to reduce the length of the release notes.
* We place a lot of emphasis on the “BC-breaking” and “deprecation” sections. Those are where the most effort should go. The “improvements” and “bug fixes” for the Python API should be nice as well. Everything else doesn’t matter too much, so feel free to cut corners if time is short.

The categories below are as follows:

* BC breaking: All commits that are BC-breaking. These are the most important commits. If any pre-sorted commit is actually BC-breaking, do move it to this section. Each commit should contain a paragraph explaining the rationale behind the change as well as an example of how to update user code [BC-Guidelines](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit#heading=h.a9htwgvvec1m).
* Deprecations: All commits introducing deprecations. Each commit should include a small example explaining what should be done to update user code.
* new_features: All commits introducing a new feature (new functions, new submodules, newly supported platforms, etc.)
* improvements: All commits providing improvements to an existing feature should be here (a new backend for a function, a new argument, better numerical stability).
* bug fixes: All commits that fix bugs and behaviors that do not match the documentation.
* performance: All commits that are added mainly for performance (we separate these from improvements above to make them easier for users to find).
* documentation: All commits that add/update documentation.
* Developers: All commits that are not end-user facing but still impact people who compile from source, develop PyTorch itself, extend PyTorch, etc.
"""

    return [header]


def main():
    parser = argparse.ArgumentParser(description="Tool to create a commit list")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create-new", "--create_new", nargs=2)
    group.add_argument("--update-to", "--update_to")
    # I found this flag useful when experimenting with adding new auto-categorizing filters.
    # After running commitlist.py the first time, if you add any new filters in this file,
    # re-running with "rerun_with_new_filters" will update the existing commitlist.csv file,
    # but only affect the rows that were previously marked as "Uncategorized"
    group.add_argument(
        "--rerun-with-new-filters", "--rerun_with_new_filters", action="store_true"
    )
    group.add_argument("--stat", action="store_true")
    group.add_argument("--export-markdown", "--export_markdown", action="store_true")
    group.add_argument(
        "--export-csv-categories", "--export_csv_categories", action="store_true"
    )
    parser.add_argument("--path", default="results/commitlist.csv")
    args = parser.parse_args()

    if args.create_new:
        create_new(args.path, args.create_new[0], args.create_new[1])
        print(
            f"Finished creating new commit list. Results have been saved to {args.path}"
        )
        return
    if args.update_to:
        update_existing(args.path, args.update_to)
        return
    if args.rerun_with_new_filters:
        rerun_with_new_filters(args.path)
        return
    if args.stat:
        commits = CommitList.from_existing(args.path)
        stats = commits.stat()
        pprint.pprint(stats)
        return

    if args.export_csv_categories:
        commits = CommitList.from_existing(args.path)
        categories = list(commits.stat().keys())
        for category in categories:
            print(f"Exporting {category}...")
            filename = f"results/export/result_{category}.csv"
            CommitList.write_to_disk_static(filename, commits.filter(category=category))
        return

    if args.export_markdown:
        commits = CommitList.from_existing(args.path)
        categories = list(commits.stat().keys())
        for category in categories:
            print(f"Exporting {category}...")
            lines = get_markdown_header(category)
            lines += to_markdown(commits, category)
            filename = f"results/export/result_{category}.md"
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, "w") as f:
                f.writelines(lines)
        return
    raise AssertionError


if __name__ == "__main__":
    main()