import dataclasses
import itertools
import platform
import time
from typing import Optional, Tuple

from mixtral_moe_model import ConditionalFeedForward, Transformer as MixtralMoE
from mixtral_moe_quantize import (
    ConditionalFeedForwardInt8,
    WeightOnlyInt8QuantHandler as MixtralMoEWeightOnlyInt8QuantHandler,
)
from model import Transformer as LLaMA
from quantize import WeightOnlyInt8QuantHandler as LLaMAWeightOnlyInt8QuantHandler

import torch
import torch._inductor.config


torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.triton.unique_kernel_names = True
# Experimental feature to reduce compilation times; expected to be on by default in a future release.
torch._inductor.config.fx_graph_cache = True
torch._inductor.config.assert_indirect_indexing = False


@dataclasses.dataclass
class GPTModelConfig:
    name: str
    module: type
    mode: Optional[str]
    quantizer: type
    # Target numbers that the measured results are compared against.
    token_per_sec: float
    memory_bandwidth: float
    compilation_time: float


def device_sync(device):
    if "cuda" in device:
        torch.cuda.synchronize(device)
    elif "cpu" in device:
        pass
    else:
        print(f"device={device} is not yet supported")


def get_arch_name() -> str:
    if torch.cuda.is_available():
        return torch.cuda.get_device_name()
    else:
        # This returns x86_64 or arm64 (for aarch64)
        return platform.machine()


def multinomial_sample_one_no_sync(
    probs_sort,
):  # Does multinomial sampling without a cuda synchronization
    # Exponential race: argmax(p_i / q_i) with q_i ~ Exp(1) picks index i with
    # probability proportional to p_i, so no host-device sync is needed.
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)


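# Hypothetical sanity check (not called by the benchmark): drawing many samples
# with the exponential-race trick should approximately recover the input
# distribution, matching what torch.multinomial would produce.
def _demo_sample_trick() -> None:
    probs = torch.tensor([0.1, 0.2, 0.7])
    draws = torch.stack(
        [multinomial_sample_one_no_sync(probs) for _ in range(10_000)]
    )
    # Empirical frequencies; expect roughly [0.1, 0.2, 0.7].
    print(torch.bincount(draws.flatten().long(), minlength=3).float() / 10_000)

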
def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    logits = logits / max(temperature, 1e-5)

    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        # pivot is the k-th largest logit; everything below it is masked to -inf.
        pivot = v.select(-1, -1).unsqueeze(-1)
        logits = torch.where(logits < pivot, -float("Inf"), logits)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs


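# Hypothetical illustration (not called by the benchmark): with top_k=2, only
# the two largest logits keep nonzero probability after the masking above.
def _demo_logits_to_probs() -> None:
    logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
    print(logits_to_probs(logits, temperature=1.0, top_k=2))

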
def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    # logits: [B, S, vocab]; sample from the distribution at the last position.
    probs = logits_to_probs(logits[0, -1], temperature, top_k)
    idx_next = multinomial_sample_one_no_sync(probs)
    return idx_next, probs


@torch.compile(fullgraph=True)
def prefill(
    model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
) -> torch.Tensor:
    # input_pos: [B, S]
    logits = model(x, input_pos)
    return sample(logits, **sampling_kwargs)[0]


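# mode="reduce-overhead" captures the compiled decode step with CUDA graphs,
# which removes the per-token kernel-launch overhead.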
@torch.compile(fullgraph=True, mode="reduce-overhead")
def decode_one_token(
    model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
) -> Tuple[torch.Tensor, torch.Tensor]:
    # input_pos: [B, 1]
    assert input_pos.shape[-1] == 1
    logits = model(x, input_pos)
    return sample(logits, **sampling_kwargs)


def decode_n_tokens(
    model: torch.nn.Module,
    cur_token: torch.Tensor,
    input_pos: torch.Tensor,
    num_new_tokens: int,
    **sampling_kwargs,
):
    new_tokens, new_probs = [], []
    for i in range(num_new_tokens):
        with torch.nn.attention.sdpa_kernel(
            torch.nn.attention.SDPBackend.MATH
        ):  # Actually better for Inductor to codegen attention here
            next_token, next_prob = decode_one_token(
                model, cur_token, input_pos, **sampling_kwargs
            )
            input_pos += 1
            # Clone: with CUDA graphs the outputs reuse the same buffers on the
            # next replay, so keep a copy of each step's result.
            new_tokens.append(next_token.clone())
            new_probs.append(next_prob.clone())
            cur_token = next_token.view(1, -1)

    return new_tokens, new_probs


@torch.no_grad()
def generate(
    model: torch.nn.Module, prompt: torch.Tensor, max_new_tokens: int, **sampling_kwargs
) -> torch.Tensor:
    device, dtype = prompt.device, prompt.dtype
    T = prompt.size(0)
    T_new = T + max_new_tokens
    max_seq_length = min(T_new, model.config.block_size)

    with torch.device(device):
        model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)

    # create an empty tensor of the expected final shape and fill in the current tokens
    empty = torch.empty(T_new, dtype=dtype, device=device)
    empty[:T] = prompt
    seq = empty
    input_pos = torch.arange(0, T, device=device)

    next_token = prefill(model, prompt.view(1, -1), input_pos, **sampling_kwargs)
    seq[T] = next_token

    input_pos = torch.tensor([T], device=device, dtype=torch.int)

    generated_tokens, _ = decode_n_tokens(
        model, next_token.view(1, -1), input_pos, max_new_tokens - 1, **sampling_kwargs
    )
    seq[T + 1 :] = torch.cat(generated_tokens)
    return seq


def _load_model(x: GPTModelConfig, device="cuda", precision=torch.bfloat16):
    with torch.device("meta"):
        model = x.module.from_name(x.name)
    model = model.to(dtype=precision)

    if x.mode == "int8":
        print("Using int8 weight-only quantization!")
        model = x.quantizer(model).convert_for_runtime()

    # The benchmark measures speed rather than quality, so fill the model with
    # random weights instead of loading a real checkpoint.
    state_dict = model.state_dict()
    for k, v in state_dict.items():
        state_dict[k] = torch.nn.Parameter(
            torch.randn(v.shape, device=device).to(dtype=v.dtype),
            requires_grad=v.requires_grad,
        )
    model.load_state_dict(state_dict, assign=True)
    return model.eval()


# Only count activated parameters and buffers.
def _get_model_size(model):
    model_size = 0
    for name, child in model.named_children():
        if not isinstance(child, torch.nn.Embedding):
            model_size += sum(
                p.numel() * p.dtype.itemsize
                for p in itertools.chain(child.parameters(), child.buffers())
            )

    # Remove the inactivated experts from the model size if this is a
    # mixture-of-experts architecture, since only activated experts are loaded.
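    # Worked example with hypothetical numbers: with 8 experts of which 2 are
    # activated per token, the subtraction below removes 6/8 of each expert
    # module's weight bytes.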
    if hasattr(model.config, "num_experts"):
        config = model.config
        for submodule in model.modules():
            if isinstance(
                submodule, (ConditionalFeedForward, ConditionalFeedForwardInt8)
            ):
                model_size -= (
                    sum(
                        p.numel() * p.dtype.itemsize
                        for p in itertools.chain(
                            submodule.parameters(), submodule.buffers()
                        )
                    )
                    * (config.num_experts - config.num_activated_experts)
                    / config.num_experts
                )

    return model_size


def run_experiment(
    x: GPTModelConfig,
    num_samples: int = 5,
    max_new_tokens: int = 200,
    top_k: int = 200,
    temperature: float = 0.8,
    device: str = "cuda",
) -> Tuple[float, float, float]:
    print(f"Loading model {x.name}")
    t0 = time.time()
    model = _load_model(x, device=device)
    device_sync(device=device)  # MKG
    print(f"Time to load model: {time.time() - t0:.02f} seconds")

    # A fixed prompt of hard-coded token ids; only its length matters for the
    # throughput measurement.
    prompt = torch.tensor(
        [1, 15043, 29892, 590, 1024, 338], device=device, dtype=torch.int32
    )
    prompt_length = prompt.size(0)

    torch.manual_seed(1234)
    model_size = _get_model_size(model)

    aggregate_metrics = {"tokens_per_sec": [], "memory_bandwidth": []}
    # Iteration i == -1 is a warm-up run that triggers compilation and is
    # excluded from the aggregated metrics.
    start = -1
    compilation_time = None

    for i in range(start, num_samples):
        device_sync(device=device)  # MKG

        t0 = time.perf_counter()
        y = generate(
            model, prompt, max_new_tokens, temperature=temperature, top_k=top_k
        )

        if i == -1:
            compilation_time = time.perf_counter() - t0
            print(f"Compilation time: {compilation_time:.2f} seconds")
            continue

        device_sync(device=device)  # MKG
        t = time.perf_counter() - t0
        tokens_generated = y.size(0) - prompt_length
        tokens_sec = tokens_generated / t
        aggregate_metrics["tokens_per_sec"].append(tokens_sec)
        # Effective bandwidth in GB/s: each generated token reads the activated
        # weights once, so bytes moved per second is model_size * tokens/sec.
        aggregate_metrics["memory_bandwidth"].append(model_size * tokens_sec / 1e9)

    token_per_sec = torch.mean(torch.tensor(aggregate_metrics["tokens_per_sec"])).item()
    memory_bandwidth = torch.mean(
        torch.tensor(aggregate_metrics["memory_bandwidth"])
    ).item()
    print(f"Average tokens/sec: {token_per_sec:.2f} tokens/sec")
    print(f"Average bandwidth achieved: {memory_bandwidth:.02f} GB/s")
    print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
    return token_per_sec, memory_bandwidth, compilation_time


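# The three run_* functions below build identical Experiment rows. A minimal
# factoring sketch (hypothetical, not wired in below) that assumes the same
# positional Experiment fields those functions pass:
def _experiment_rows(model, token_per_sec, memory_bandwidth, compilation_time, device):
    from benchmark import Experiment

    rows = [
        ("token_per_sec", model.token_per_sec, token_per_sec),
        ("memory_bandwidth(GB/s)", model.memory_bandwidth, memory_bandwidth),
        ("compilation_time(s)", model.compilation_time, compilation_time),
    ]
    return [
        Experiment(
            model.name,
            metric,
            target,
            f"{actual:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        )
        for metric, target, actual in rows
    ]

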
# token_per_sec and memory_bandwidth targets are for an A100-40GB, which differs from the more common A100-80GB.
def run_llama2_7b_bf16(device: str = "cuda"):
    from benchmark import Experiment

    model = GPTModelConfig(
        "Llama-2-7b-chat-hf",
        LLaMA,
        "bfloat16",
        LLaMAWeightOnlyInt8QuantHandler,
        94,
        1253,
        162,
    )
    token_per_sec, memory_bandwidth, compilation_time = run_experiment(
        model, device=device
    )
    return [
        Experiment(
            model.name,
            "token_per_sec",
            model.token_per_sec,
            f"{token_per_sec:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
        Experiment(
            model.name,
            "memory_bandwidth(GB/s)",
            model.memory_bandwidth,
            f"{memory_bandwidth:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
        Experiment(
            model.name,
            "compilation_time(s)",
            model.compilation_time,
            f"{compilation_time:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
    ]


# token_per_sec and memory_bandwidth targets are for an A100-40GB, which differs from the more common A100-80GB.
def run_llama2_7b_int8(device: str = "cuda"):
    from benchmark import Experiment

    model = GPTModelConfig(
        "Llama-2-7b-chat-hf",
        LLaMA,
        "int8",
        LLaMAWeightOnlyInt8QuantHandler,
        144,
        957,
        172,
    )
    token_per_sec, memory_bandwidth, compilation_time = run_experiment(
        model, device=device
    )
    return [
        Experiment(
            model.name,
            "token_per_sec",
            model.token_per_sec,
            f"{token_per_sec:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
        Experiment(
            model.name,
            "memory_bandwidth(GB/s)",
            model.memory_bandwidth,
            f"{memory_bandwidth:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
        Experiment(
            model.name,
            "compilation_time(s)",
            model.compilation_time,
            f"{compilation_time:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
    ]


# token_per_sec and memory_bandwidth targets are for an A100-40GB, which differs from the more common A100-80GB.
def run_mixtral_8x7b_int8(device: str = "cuda"):
    from benchmark import Experiment

    # The original number of layers was reduced from 32 to 16 to fit CI memory limits.
    model = GPTModelConfig(
        "Mixtral-8x7B-v0.1",
        MixtralMoE,
        "int8",
        MixtralMoEWeightOnlyInt8QuantHandler,
        175,
        1130,
        162,
    )
    token_per_sec, memory_bandwidth, compilation_time = run_experiment(
        model, device=device
    )
    return [
        Experiment(
            model.name,
            "token_per_sec",
            model.token_per_sec,
            f"{token_per_sec:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
        Experiment(
            model.name,
            "memory_bandwidth(GB/s)",
            model.memory_bandwidth,
            f"{memory_bandwidth:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
        Experiment(
            model.name,
            "compilation_time(s)",
            model.compilation_time,
            f"{compilation_time:.02f}",
            model.mode,
            device,
            get_arch_name(),
            True,
        ),
    ]
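

if __name__ == "__main__":
    # Hypothetical standalone entry point (the functions above are normally
    # driven from benchmark.py): run one configuration and print its rows.
    for experiment in run_llama2_7b_bf16():
        print(experiment)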