# mypy: ignore-errors

import argparse
import json
import warnings

import pandas as pd  # type: ignore[import-untyped]

from torch._inductor.autoheuristic.autoheuristic_utils import (
    CHOICE_COL,
    get_metadata_str_from_log,
)


# TODO (AlnisM): Fix these warnings
warnings.filterwarnings(
    "ignore",
    message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
)
warnings.filterwarnings(
    "ignore",
    message="DataFrameGroupBy.apply operated on the grouping columns.",
)


class AHTrain:
    """
    Base class for AutoHeuristic training.
    """

    def __init__(self) -> None:
        self.parser = argparse.ArgumentParser()
        self.add_base_arguments()
        self.args = None

    def add_base_arguments(self):
        """Registers command-line arguments shared by all AutoHeuristic trainers."""
        self.parser.add_argument(
            "dataset",
            type=str,
            help="Path to text file containing data collected with AutoHeuristic.",
        )
        self.parser.add_argument(
            "--nrows",
            type=int,
            default=None,
            help="Only read first n rows of the dataset.",
        )
        self.parser.add_argument(
            "--heuristic-name",
            type=str,
            default="learned_heuristic",
            help="Name of the heuristic to be generated.",
        )
        self.parser.add_argument(
            "--data",
            nargs=2,
            action="append",
            metavar=("TYPE", "PATH"),
            help="Specify name of datasets and file paths to be evaluated.",
        )
        self.parser.add_argument(
            "--save-dot",
            action="store_true",
            help="Export heuristic to graphviz dot.",
        )
        self.parser.add_argument(
            "--ranking",
            type=int,
            default=None,
            help="""
            Makes AutoHeuristic learn a heuristic that ranks choices instead of
            predicting a single choice. The argument is the number of choices
            the heuristic will provide.
            """,
        )

    def parse_args(self):
        return self.parser.parse_args()

    def parse_log(self, log_path, nrows=None):
        """Parses an AutoHeuristic log and returns the dataframe, the metadata,
        the feature names, the categorical feature names, and the observed choices."""
        (df, metadata) = self.deserialize_data(log_path)
        numerical_features = metadata["numerical_features"]
        categorical_features = metadata["categorical_features"]
        choices = df[CHOICE_COL].unique().tolist()
        features = numerical_features + categorical_features
        if nrows is not None:
            df = df.head(nrows)
        df = self.filter_df(df)
        return (df, metadata, features, categorical_features, choices)

    def generate_heuristic(self):
        """Parses the command-line arguments and runs the training pipeline
        (self.main is expected to be provided by a subclass)."""
        self.args = self.parse_args()
        self.main(
            self.args.dataset,
            self.args.data,
            self.args.nrows,
            self.args.heuristic_name,
            self.args.save_dot,
            self.args.ranking is not None,
        )

    def filter_df(self, df):
        """Hook for subclasses to filter rows of the dataframe."""
        return df

    def add_new_features(self, results):
        """Hook for subclasses to add derived features; returns the (possibly
        augmented) dataframe and the names of the added features."""
        return (results, [])

    def add_real_datasets(self, datasets, other_datasets, cat_feature2cats):
        """Adds the datasets specified with --data (self.get_df is expected to
        be provided by a subclass)."""
        if other_datasets:
            for name, path in other_datasets:
                (df_other, choices, _, _, _) = self.get_df(
                    path, cat_feature2cats=cat_feature2cats, apply_filters=False
                )
                datasets[name] = df_other

    def handle_categorical_features(
        self, cat_feature2cats, categorical_features, results
    ):
        # We fix the set of categories here because if we later create another
        # df for testing purposes and that df does not contain all categories
        # for a categorical feature, pd.get_dummies will not create columns
        # for the missing categories
        if not cat_feature2cats:
            cat_feature2cats = {}
        for cat_feature in categorical_features:
            if cat_feature in cat_feature2cats:
                categories = cat_feature2cats[cat_feature]
            else:
                categories = results[cat_feature].unique()
                cat_feature2cats[cat_feature] = categories
            results[cat_feature] = pd.Categorical(
                results[cat_feature], categories=categories
            )

        # remember which (feature, value) pair each dummy column encodes
        dummy_col_2_col_val = {}
        for col in categorical_features:
            unique_vals = results[col].unique()
            for val in unique_vals:
                dummy_col_2_col_val[f"{col}_{val}"] = (col, val)
        # one-hot encode categorical features
        results = pd.get_dummies(results, columns=categorical_features)
        return (results, cat_feature2cats, dummy_col_2_col_val)

    def gen_precondition(self, opt_name, shared_memory, device_capa):
        """Generates the check_precondition() method of the emitted heuristic."""
        return f"""    def check_precondition(self, metadata: AHMetadata, context: AHContext) -> bool:
        return (
            metadata.name == self.get_name()
            and metadata.shared_memory == {shared_memory}
            and str(metadata.device_capa) == "{device_capa}"
        )"""

    def codegen_boilerplate(
        self, heuristic_name, opt_name, threshold, shared_memory, device_capa, dt
    ):
        # Overridden by subclasses: generates the boilerplate of the emitted heuristic.
        pass

    def gen_predict_fn_def(self):
        # Overridden by subclasses: generates the signature of the predict function.
        pass

    def write_heuristic_to_file(self, lines, heuristic_name):
        output_file = (
            f"../../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py"
        )
        with open(output_file, "w") as f:
            f.write("\n".join(lines) + "\n")

    def deserialize_data(self, log_path):
        """Reads the metadata (first line of the log) and the CSV data that follows."""
        json_string = get_metadata_str_from_log(log_path)
        metadata = self.deserialize_metadata(json_string)
        df = pd.read_csv(log_path, skiprows=1, on_bad_lines="skip")
        return (df, metadata)

    def deserialize_metadata(self, json_string):
        return json.loads(json_string)


if __name__ == "__main__":
    train = AHTrain()
    train.generate_heuristic()
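
# Usage sketch (illustrative only, not part of this module): AHTrain is meant
# to be subclassed. generate_heuristic() calls self.main(...) and
# add_real_datasets() calls self.get_df(...), neither of which is defined
# here, so a concrete trainer must provide both. The subclass name and method
# bodies below are hypothetical; only the signatures are inferred from the
# call sites in this file.
#
#     class MyTrain(AHTrain):
#         def main(self, log_path, other_datasets, nrows, heuristic_name,
#                  save_dot, ranking):
#             df, metadata, features, cat_features, choices = self.parse_log(
#                 log_path, nrows
#             )
#             # ... fit a model on df, then emit code, e.g.:
#             # self.write_heuristic_to_file(lines, heuristic_name)
#
#         def get_df(self, log_path, cat_feature2cats=None, apply_filters=True):
#             # must return a 5-tuple whose first element is the dataframe,
#             # as unpacked in add_real_datasets()
#             ...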