import argparse
import csv
import dataclasses
import os
import pprint
import re
from collections import defaultdict
from pathlib import Path
from typing import List, Optional

import common
from common import (
    features_to_dict,
    frontend_categories,
    get_commit_data_cache,
    run,
    topics,
)


"""
Example Usages

Create a new commitlist for consumption by categorize.py.
Said commitlist contains commits between v1.5.0 and f5bc91f851.

    python commitlist.py --create-new tags/v1.5.0 f5bc91f851

Update the existing commitlist to commit bfcb687b9c.

    python commitlist.py --update-to bfcb687b9c

"""


@dataclasses.dataclass(frozen=False)
class Commit:
    """One row of the commitlist CSV: a single commit plus its release-notes
    categorization metadata."""

    commit_hash: str
    category: str
    topic: str
    title: str
    files_changed: str  # space-separated list of changed file paths
    pr_link: Optional[str]  # None when the commit has no associated PR
    author: str

    # This is not a list so that it is easier to put in a spreadsheet
    accepter_1: str
    accepter_2: str
    accepter_3: str

    # Hash of another commit this one should be folded into in the final
    # markdown output (e.g. a follow-up fix), or None.
    merge_into: Optional[str] = None

    def __repr__(self):
        return (
            f"Commit({self.commit_hash}, {self.category}, {self.topic}, {self.title})"
        )


# CSV column order; derived from the dataclass so the two never drift apart.
commit_fields = tuple(f.name for f in dataclasses.fields(Commit))


class CommitList:
    """An on-disk CSV of `Commit` rows plus the logic to build and update it.

    NB: Private ctor. Use `from_existing` or `create_new`.
    """

    def __init__(self, path: str, commits: List[Commit]):
        self.path = path
        self.commits = commits

    @staticmethod
    def from_existing(path):
        """Load a commitlist previously written to `path`."""
        commits = CommitList.read_from_disk(path)
        return CommitList(path, commits)

    @staticmethod
    def create_new(path, base_version, new_version):
        """Build a fresh commitlist for commits in `base_version..new_version`.

        Raises:
            ValueError: if `path` already exists (refuses to clobber it).
        """
        if os.path.exists(path):
            raise ValueError(
                "Attempted to create a new commitlist but one exists already!"
            )
        commits = CommitList.get_commits_between(base_version, new_version)
        return CommitList(path, commits)

    @staticmethod
    def read_from_disk(path) -> List[Commit]:
        """Parse the CSV at `path` into `Commit` objects.

        A non-empty `new_title` column (added by human editors) overrides
        `title`; any columns not in `commit_fields` are dropped, and missing
        ones default to the empty string.
        """
        with open(path) as csvfile:
            reader = csv.DictReader(csvfile)
            rows = []
            for row in reader:
                if row.get("new_title", "") != "":
                    row["title"] = row["new_title"]
                filtered_rows = {k: row.get(k, "") for k in commit_fields}
                rows.append(Commit(**filtered_rows))
        return rows

    def write_result(self):
        """Persist this commitlist back to its own path."""
        self.write_to_disk_static(self.path, self.commits)

    @staticmethod
    def write_to_disk_static(path, commit_list):
        """Write `commit_list` as CSV to `path`, creating parent dirs."""
        os.makedirs(Path(path).parent, exist_ok=True)
        with open(path, "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(commit_fields)
            for commit in commit_list:
                writer.writerow(dataclasses.astuple(commit))

    @staticmethod
    def keywordInFile(file, keywords):
        """Return True if any of `keywords` is a substring of `file`."""
        return any(key in file for key in keywords)

    @staticmethod
    def gen_commit(commit_hash):
        """Build a `Commit` for `commit_hash` from the commit-data cache."""
        feature_item = get_commit_data_cache().get(commit_hash)
        features = features_to_dict(feature_item)
        category, topic = CommitList.categorize(features)
        # Pad with empty strings so there are always exactly three accepters.
        a1, a2, a3 = (features["accepters"] + ("", "", ""))[:3]
        if features["pr_number"] is not None:
            pr_link = f"https://github.com/pytorch/pytorch/pull/{features['pr_number']}"
        else:
            pr_link = None
        files_changed_str = " ".join(features["files_changed"])
        return Commit(
            commit_hash,
            category,
            topic,
            features["title"],
            files_changed_str,
            pr_link,
            features["author"],
            a1,
            a2,
            a3,
        )

    @staticmethod
    def category_remapper(category: str) -> str:
        """Map a raw `release notes:` label onto its canonical category name."""
        if category in frontend_categories:
            return category + "_frontend"
        if category == "Meta API":
            return "composability"
        if category in common.quantization.categories:
            return common.quantization.name
        if category in common.distributed.categories:
            return common.distributed.name
        return category

    @staticmethod
    def bracket_category_matcher(title: str):
        """Categorize a commit based on the presence of a bracketed category in the title.

        Args:
            title (str): title to search

        Returns:
            optional[str]
        """
        pairs = [
            ("[dynamo]", "dynamo"),
            ("[torchdynamo]", "dynamo"),
            ("[torchinductor]", "inductor"),
            ("[inductor]", "inductor"),
            ("[codemod", "skip"),
            ("[profiler]", "profiler"),
            ("[functorch]", "functorch"),
            ("[autograd]", "autograd_frontend"),
            ("[quantization]", "quantization"),
            ("[nn]", "nn_frontend"),
            ("[complex]", "complex_frontend"),
            ("[mps]", "mps"),
            ("[optimizer]", "optimizer_frontend"),
            ("[xla]", "xla"),
        ]
        title_lower = title.lower()
        for bracket, category in pairs:
            if bracket in title_lower:
                return category
        return None

    @staticmethod
    def categorize(features):
        """Heuristically assign a (category, topic) pair to a commit.

        Precedence: explicit PR labels > title keywords/brackets > changed
        file paths > miscellaneous fallbacks.
        """
        title = features["title"]
        labels = features["labels"]
        category = "Uncategorized"
        topic = "Untopiced"

        # Revert commits are merged directly to master with no associated PR number
        if features["pr_number"] is None:
            if title.startswith("Revert"):
                return "skip", topic

        # We ask contributors to label their PR's appropriately
        # when they're first landed.
        # Check if the labels are there first.
        already_categorized = already_topiced = False
        for label in labels:
            if label.startswith("release notes: "):
                category = label.split("release notes: ", 1)[1]
                category = CommitList.category_remapper(category)
                already_categorized = True
            if label.startswith("topic: "):
                topic = label.split("topic: ", 1)[1]
                already_topiced = True
        if already_categorized and already_topiced:
            return category, topic

        # update this to check if each file starts with caffe2
        if "caffe2" in title:
            return "caffe2", topic
        if "Reverted" in labels:
            return "skip", topic
        if "module: deprecation" in labels:
            topic = "deprecation"

        found_bracket_category = CommitList.bracket_category_matcher(title)
        if found_bracket_category:
            return found_bracket_category, topic

        files_changed = features["files_changed"]
        for file in files_changed:
            file_lowercase = file.lower()
            if CommitList.keywordInFile(
                file,
                [
                    "docker/",
                    ".circleci",
                    ".github",
                    ".jenkins",
                    ".ci",
                    ".azure_pipelines",
                ],
            ):
                category = "releng"
                break
            # datapipe(s), torch/utils/data, test_{dataloader, datapipe}
            if CommitList.keywordInFile(
                file, ["torch/utils/data", "test_dataloader", "test_datapipe"]
            ):
                category = "dataloader_frontend"
                break
            if CommitList.keywordInFile(file, ["torch/csrc/api", "test/cpp/api"]):
                category = "cpp_frontend"
                break
            if CommitList.keywordInFile(file, ["distributed", "c10d"]):
                category = "distributed"
                break
            if "vulkan" in file_lowercase:
                category = "vulkan"
                break
            # Bugfix: this previously checked `"Foreach" in file_lowercase`,
            # which is always False since the string has been lowercased.
            if "foreach" in file_lowercase:
                category = "foreach_frontend"
                break
            if "onnx" in file_lowercase:
                category = "onnx"
                break
            if CommitList.keywordInFile(file, ["torch/fx", "test_fx"]):
                category = "fx"
                break
            if CommitList.keywordInFile(file, ["torch/ao", "test/ao"]):
                category = common.quantization.name
                break
            # torch/quantization, test/quantization, aten/src/ATen/native/quantized, torch/nn/{quantized, quantizable}
            if CommitList.keywordInFile(
                file,
                [
                    "torch/quantization",
                    "test/quantization",
                    "aten/src/ATen/native/quantized",
                    "torch/nn/quantiz",
                ],
            ):
                category = common.quantization.name
                break
            if CommitList.keywordInFile(file, ["torch/package", "test/package"]):
                category = "package"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "torch/csrc/jit/mobile",
                    "aten/src/ATen/native/metal",
                    "test/mobile",
                    "torch/backends/_nnapi/",
                    "test/test_nnapi.py",
                ],
            ):
                category = "mobile"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "aten/src/ATen/native/LinearAlgebra.cpp",
                    "test/test_linalg.py",
                    "torch/linalg",
                ],
            ):
                category = "linalg_frontend"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "torch/sparse",
                    "aten/src/ATen/native/sparse",
                    "torch/_masked/__init__.py",
                ],
            ):
                category = "sparse_frontend"
                break
            if CommitList.keywordInFile(file, ["tools/autograd"]):
                category = "autograd_frontend"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "test/test_nn.py",
                    "test/test_module.py",
                    "torch/nn/modules",
                    "torch/nn/functional.py",
                ],
            ):
                category = "nn_frontend"
                break
            if CommitList.keywordInFile(file, ["torch/csrc/jit", "torch/jit"]):
                category = "jit"
                break
            if CommitList.keywordInFile(
                file,
                [
                    "torch/_meta_registrations.py",
                    "torch/_decomp",
                    "torch/_prims",
                    "torch/_refs",
                ],
            ):
                category = "composability"
                break
            if CommitList.keywordInFile(file, ["torch/_dynamo"]):
                category = "dynamo"
                break
            if CommitList.keywordInFile(file, ["torch/_inductor"]):
                category = "inductor"
                break
        else:
            # Below are some extra quick checks that aren't necessarily file-path related,
            # but I found that to catch a decent number of extra commits.
            if len(files_changed) > 0 and all(
                f_name.endswith((".cu", ".cuh")) for f_name in files_changed
            ):
                category = "cuda"
            elif "[PyTorch Edge]" in title:
                category = "mobile"
            elif (
                len(files_changed) == 1
                and "torch/testing/_internal/common_methods_invocations.py"
                in files_changed[0]
            ):
                # when this is the only file changed, it's almost always an OpInfo change.
                category = "python_frontend"
            elif len(files_changed) == 1 and "torch/_torch_docs.py" in files_changed[0]:
                # individual torch_docs changes are usually for python ops
                category = "python_frontend"

        # If we couldn't find a category but the topic is not user facing we can skip these:
        if category == "Uncategorized" and topic == "not user facing":
            category = "skip"

        return category, topic

    @staticmethod
    def get_commits_between(base_version, new_version):
        """Return `Commit`s for everything reachable from `new_version` but
        not from the merge-base of the two revisions, oldest first."""
        cmd = f"git merge-base {base_version} {new_version}"
        rc, merge_base, _ = run(cmd)
        assert rc == 0

        # Returns a list of something like
        # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
        cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
        rc, commits, _ = run(cmd)
        assert rc == 0

        # An empty revision range produces an empty string; skip blank lines
        # so we don't try to unpack a hash out of "".
        log_lines = [line for line in commits.split("\n") if line]
        hashes = [log_line.split(" ", 1)[0] for log_line in log_lines]
        return [CommitList.gen_commit(commit_hash) for commit_hash in hashes]

    def filter(self, *, category=None, topic=None):
        """Return commits matching the given category and/or topic (both
        filters are ANDed; a None filter matches everything)."""
        commits = self.commits
        if category is not None:
            commits = [commit for commit in commits if commit.category == category]
        if topic is not None:
            commits = [commit for commit in commits if commit.topic == topic]
        return commits

    def update_to(self, new_version):
        """Append all commits between the last known commit and `new_version`."""
        last_hash = self.commits[-1].commit_hash
        new_commits = CommitList.get_commits_between(last_hash, new_version)
        self.commits += new_commits

    def stat(self):
        """Return a nested {category: {topic: count}} tally of all commits."""
        counts = defaultdict(lambda: defaultdict(int))
        for commit in self.commits:
            counts[commit.category][commit.topic] += 1
        return counts


def create_new(path, base_version, new_version):
    """Create and persist a brand-new commitlist at `path`."""
    commits = CommitList.create_new(path, base_version, new_version)
    commits.write_result()


def update_existing(path, new_version):
    """Extend the commitlist at `path` up to `new_version` and persist it."""
    commits = CommitList.from_existing(path)
    commits.update_to(new_version)
    commits.write_result()


def rerun_with_new_filters(path):
    """Re-categorize only the rows whose category is still unknown, leaving
    human-curated rows untouched, and persist the result."""
    current_commits = CommitList.from_existing(path)
    # The cache is the same for every commit; fetch it once outside the loop.
    cache = get_commit_data_cache()
    for i, commit in enumerate(current_commits.commits):
        current_category = commit.category
        if (
            current_category == "Uncategorized"
            or current_category not in common.categories
        ):
            features = features_to_dict(cache.get(commit.commit_hash))
            category, topic = CommitList.categorize(features)
            current_commits.commits[i] = dataclasses.replace(
                commit, category=category, topic=topic
            )
    current_commits.write_result()


def get_hash_or_pr_url(commit: Commit):
    """Render a commit as a markdown PR link when possible, else its hash."""
    pr_link = commit.pr_link
    if pr_link is None:
        return commit.commit_hash

    regex = r"https://github.com/pytorch/pytorch/pull/([0-9]+)"
    matches = re.findall(regex, pr_link)
    if len(matches) == 0:
        return commit.commit_hash

    return f"[#{matches[0]}]({pr_link})"


def to_markdown(commit_list: CommitList, category):
    """Render every topic section of `category` as markdown lines.

    Commits marked `merge_into` are folded into their target commit's line;
    topics are matched with both underscore and space spellings.
    """

    def cleanup_title(commit):
        # Strip the trailing " (#12345)" PR suffix that gh adds to titles.
        match = re.match(r"(.*) \(#\d+\)", commit.title)
        if match is None:
            return commit.title
        return match.group(1)

    merge_mapping = defaultdict(list)
    for commit in commit_list.commits:
        if commit.merge_into:
            merge_mapping[commit.merge_into].append(commit)

    cdc = get_commit_data_cache()
    lines = [f"\n## {category}\n"]
    for topic in topics:
        lines.append(f"### {topic}\n")
        commits = commit_list.filter(category=category, topic=topic)
        # Also pick up rows whose topic was spelled with the other separator.
        if "_" in topic:
            commits.extend(
                commit_list.filter(category=category, topic=topic.replace("_", " "))
            )
        if " " in topic:
            commits.extend(
                commit_list.filter(category=category, topic=topic.replace(" ", "_"))
            )
        for commit in commits:
            if commit.merge_into:
                continue
            all_related_commits = merge_mapping[commit.commit_hash] + [commit]
            commit_list_md = ", ".join(
                get_hash_or_pr_url(c) for c in all_related_commits
            )
            result = f"- {cleanup_title(commit)} ({commit_list_md})\n"
            lines.append(result)
    return lines


def get_markdown_header(category):
    """Return the boilerplate instructions header for a category worksheet."""
    header = f"""
# Release Notes worksheet {category}

The main goal of this process is to rephrase all the commit messages below to make them clear and easy to read by the end user. You should follow the following instructions to do so:

* **Please cleanup, and format commit titles to be readable by the general pytorch user.** [Detailed instructions here](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit)
* Please sort commits into the following categories (you should not rename the categories!), I tried to pre-sort these to ease your work, feel free to move commits around if the current categorization is not good.
* Please drop any commits that are not user-facing.
* If anything is from another domain, leave it in the UNTOPICED section at the end and I'll come and take care of it.
* Please use markdown format
* Please use #PR_NUM to link to the PR, instead of `[#PR_NUM](https://github.com/pytorch/pytorch/pull/#PR_NUM)` to reduce the length of the release notes
* We place a lot of emphasis on the “BC-breaking” and “deprecation” sections. Those should be where the most effort goes in. The “improvements” and “bug fixes” for Python API should be nice as well. Everything else doesn’t matter too much so feel free to cut corners if time is short.

The categories below are as follows:

* BC breaking: All commits that are BC-breaking. These are the most important commits. If any pre-sorted commit is actually BC-breaking, do move it to this section. Each commit should contain a paragraph explaining the rational behind the change as well as an example for how to update user code [BC-Guidelines](https://docs.google.com/document/d/14OmgGBr1w6gl1VO47GGGdwrIaUNr92DFhQbY_NEk8mQ/edit#heading=h.a9htwgvvec1m).
* Deprecations: All commits introducing deprecation. Each commit should include a small example explaining what should be done to update user code.
* new_features: All commits introducing a new feature (new functions, new submodule, new supported platform etc)
* improvements: All commits providing improvements to existing feature should be here (new backend for a function, new argument, better numerical stability)
* bug fixes: All commits that fix bugs and behaviors that do not match the documentation
* performance: All commits that are added mainly for performance (we separate this from improvements above to make it easier for users to look for it)
* documentation: All commits that add/update documentation
* Developers: All commits that are not end-user facing but still impact people that compile from source, develop into pytorch, extend pytorch, etc
"""

    return [header]


def main():
    """CLI entry point: exactly one mode flag selects the action to run."""
    parser = argparse.ArgumentParser(description="Tool to create a commit list")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create-new", "--create_new", nargs=2)
    group.add_argument("--update-to", "--update_to")
    # I found this flag useful when experimenting with adding new auto-categorizing filters.
    # After running commitlist.py the first time, if you add any new filters in this file,
    # re-running with "rerun_with_new_filters" will update the existing commitlist.csv file,
    # but only affect the rows that were previously marked as "Uncategorized"
    group.add_argument(
        "--rerun-with-new-filters", "--rerun_with_new_filters", action="store_true"
    )
    group.add_argument("--stat", action="store_true")
    group.add_argument("--export-markdown", "--export_markdown", action="store_true")
    group.add_argument(
        "--export-csv-categories", "--export_csv_categories", action="store_true"
    )
    parser.add_argument("--path", default="results/commitlist.csv")
    args = parser.parse_args()

    if args.create_new:
        create_new(args.path, args.create_new[0], args.create_new[1])
        # Report the actual output path rather than a hard-coded default,
        # since --path may have been overridden.
        print(
            f"Finished creating new commit list. Results have been saved to {args.path}"
        )
        return
    if args.update_to:
        update_existing(args.path, args.update_to)
        return
    if args.rerun_with_new_filters:
        rerun_with_new_filters(args.path)
        return
    if args.stat:
        commits = CommitList.from_existing(args.path)
        stats = commits.stat()
        pprint.pprint(stats)
        return

    if args.export_csv_categories:
        commits = CommitList.from_existing(args.path)
        categories = list(commits.stat().keys())
        for category in categories:
            print(f"Exporting {category}...")
            filename = f"results/export/result_{category}.csv"
            CommitList.write_to_disk_static(filename, commits.filter(category=category))
        return

    if args.export_markdown:
        commits = CommitList.from_existing(args.path)
        categories = list(commits.stat().keys())
        for category in categories:
            print(f"Exporting {category}...")
            lines = get_markdown_header(category)
            lines += to_markdown(commits, category)
            filename = f"results/export/result_{category}.md"
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, "w") as f:
                f.writelines(lines)
        return
    raise AssertionError


if __name__ == "__main__":
    main()