from transformers import AutoTokenizer
from datasets import load_dataset
from itertools import chain
# --- Configuration -------------------------------------------------------
model_name = "gpt2"
dataset_path = "wikitext"
dataset_name = "wikitext-2-raw-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
raw_dataset = load_dataset(dataset_path, dataset_name)

# Some dataset configs ship without a validation split; carve one out of the
# first 5% of the training data so evaluation is always possible.
if "validation" not in raw_dataset:
    raw_dataset["validation"] = load_dataset(
        dataset_path,
        dataset_name,
        # Plain string: the original was an f-string with no placeholders.
        split="train[:5%]",
    )

column_names = list(raw_dataset["train"].features)
# Prefer the conventional "text" column; otherwise fall back to the first one.
text_column_name = "text" if "text" in column_names else column_names[0]
# Chunk length used when grouping token sequences (1024 for GPT-2).
max_seq_length = tokenizer.model_max_length
def tokenize_function(examples):
    """Tokenize the configured text column of a batch of examples."""
    texts = examples[text_column_name]
    return tokenizer(texts)
# Tokenize every split; the raw text columns are dropped afterwards.
_map_kwargs = dict(
    batched=True,
    remove_columns=column_names,
    load_from_cache_file=True,
)
tokenized_datasets = raw_dataset.map(tokenize_function, **_map_kwargs)
def group_texts(examples, block_size=None):
    """Concatenate tokenized texts and re-split them into fixed-size chunks.

    Args:
        examples: batch mapping each column name to a list of token lists.
        block_size: chunk length. Defaults to the module-level
            ``max_seq_length`` so existing ``Dataset.map`` callers are
            unaffected; passing it explicitly makes the function reusable.

    Returns:
        dict with the same keys, each a list of ``block_size``-long chunks,
        plus a ``labels`` key copying ``input_ids`` (causal-LM targets; the
        shift happens inside the model).
    """
    if block_size is None:
        block_size = max_seq_length
    # Concatenate all texts.
    concatenated_examples = {
        k: list(chain(*examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could pad instead if the model
    # supported it. Customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [
            t[i : i + block_size]
            for i in range(0, total_length, block_size)
        ]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
# Re-chunk the tokenized splits into fixed-length blocks for causal LM.
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    load_from_cache_file=True,
)

train_dataset = tokenized_datasets["train"]
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained(model_name)
# from_config builds a freshly initialized model — weights are RANDOM.
# Use from_pretrained instead if trained GPT-2 weights are needed.
model = AutoModelForCausalLM.from_config(config)

from transformers.utils.fx import symbolic_trace

# Trace with the dataset's feature names (input_ids, attention_mask, labels)
# so the traced graph exposes exactly those tensors as placeholders.
input_names = list(next(iter(train_dataset)).keys())
traced = symbolic_trace(model, input_names = input_names)
from typing import List, Type
from transformers import PretrainedConfig
def get_split_points(config: "PretrainedConfig") -> List[str]:
    """Return module paths at which to cut the model into pipeline stages.

    Args:
        config: a model config *instance* (e.g. ``GPT2Config``). The original
            ``Type[PretrainedConfig]`` annotation was wrong — an instance, not
            a class, is passed in (attributes like ``num_hidden_layers`` are
            read from it).

    Returns:
        One split point per transformer block (``transformer.h.{i}``), plus
        the final layer norm ``transformer.ln_f``.
    """
    split_points: List[str] = []
    # config: GPT2Config for GPT2 model
    for i in range(config.num_hidden_layers):
        split_points.append(f"transformer.h.{i}")
    split_points.append("transformer.ln_f")
    return split_points
# torch.fx node names use "_" where module paths use ".", so convert the
# paths (e.g. "transformer.h.0") into node-name prefixes ("transformer_h_0").
split_points = [p.replace(".", "_") for p in get_split_points(config)]
import torch.fx
from typing import List, Dict, Tuple, Optional
def _split_nodes(
traced: torch.fx.GraphModule, split_points: List[str]
) -> Tuple[Dict[str, int], Dict[int, List[str]]]:
node_name_to_shard_id: Dict[str, int] = {}
shard_id = 0
nodes_so_far: List[torch.fx.Node] = []
extra_outputs: Dict[int, List[str]] = {}
for node in traced.graph.nodes:
if node.op in [
"placeholder",
"get_attr",
"call_function",
"call_method",
"call_module",
]:
node_name_to_shard_id[node.name] = shard_id
nodes_so_far.append(node)
point = next(filter(lambda p: node.name.startswith(p), split_points), None)
if point:
# Record outputs that should be used later.
# they will be added in return of this shard.
outputs = []
for node in nodes_so_far:
for user in node.users.keys():
if user.name not in node_name_to_shard_id:
outputs.append(node.name)
# Remove duplicate
extra_outputs[shard_id] = list(dict.fromkeys(outputs).keys())
shard_id += 1
split_points.remove(point)
elif node.op == "output":
break
assert len(split_points) == 0, "Sharding is not complete."
return node_name_to_shard_id, extra_outputs
def shard_model(
    model: torch.nn.Module,
    traced: torch.fx.GraphModule,
    split_points: List[str]
) -> List[torch.fx.GraphModule]:
    """Cut a traced model into a list of sequential GraphModule shards.

    Each shard is a ``torch.fx.GraphModule`` backed by ``model``'s parameters.
    The values a shard must pass forward (computed by ``_split_nodes``) become
    its graph output, and the next shard receives them again as placeholders,
    so the shards can be executed back-to-back like pipeline stages.

    Args:
        model: the un-traced model that owns the parameters/submodules.
        traced: the symbolic trace of ``model``.
        split_points: boundary name prefixes; forwarded to ``_split_nodes``.

    Returns:
        The shard GraphModules in execution order.
    """
    module_list: List[torch.fx.GraphModule] = []
    node_name_to_shard_id, extra_outputs = _split_nodes(traced, split_points)
    # Sentinel larger than any real shard id so the first iteration never
    # triggers the "close previous shard" branch.
    prev_shard_id = 1000
    prev_node: Optional[torch.fx.Node] = None
    # env maps original node names to the corresponding nodes of whichever
    # new graph is currently being built (placeholders after a cut).
    env: Dict[str, torch.fx.Node] = {}
    new_graph = torch.fx.Graph()
    # Iterate all nodes
    for node in traced.graph.nodes:
        if node.name in node_name_to_shard_id:
            current_shard_id = node_name_to_shard_id[node.name]
            # Crossing a shard boundary: finalize the previous graph and start
            # a new one whose placeholders mirror the finalized outputs.
            if prev_shard_id < current_shard_id:
                assert prev_node
                with new_graph.inserting_after(prev_node):
                    if prev_shard_id in extra_outputs:
                        outputs = extra_outputs[prev_shard_id]
                        outputs = tuple([env[i] for i in outputs])
                        new_graph.output(outputs)
                    else:
                        # NOTE(review): ``tuple(env[prev_node.name])`` tries
                        # to iterate a single fx.Node (not iterable); the
                        # intent looks like ``(env[prev_node.name],)``. This
                        # branch seems unreachable because _split_nodes records
                        # extra_outputs at every boundary — confirm before
                        # relying on it.
                        outputs = tuple(env[prev_node.name])
                        new_graph.output(outputs)
                # finalize this graph into GraphModule list
                new_graph.lint()
                module_list.append(torch.fx.GraphModule(model, new_graph))
                # Create a new graph
                new_graph = torch.fx.Graph()
                for output in outputs:
                    # Add all nodes in return of the previous graph to its input
                    node_name = env[output.name].name
                    pl_node = new_graph.create_node("placeholder", node_name)
                    # Re-point env at the placeholder so later node_copy calls
                    # resolve references inside the new graph.
                    env[node_name] = pl_node
        # Cut is done. Add all nodes into the current graph
        if node.op in [
            "placeholder",
            "get_attr",
            "call_function",
            "call_method",
            "call_module",
        ]:
            # Copy the nodes from the existing graph to the new graph.
            new_node = new_graph.node_copy(node, lambda x: env[x.name])
            env[node.name] = new_node
        elif node.op == "output":
            # If this is the last node, we should add an output node and add the last graph to the list.
            assert prev_node, "prev_node cannot be None"
            with new_graph.inserting_after(prev_node):
                new_node = new_graph.node_copy(node, lambda x: env[x.name])
            new_graph.lint()
            module_list.append(torch.fx.GraphModule(model, new_graph))
            break
        prev_node = new_node
        prev_shard_id = node_name_to_shard_id[node.name]
    return module_list
# Shard the traced model at the requested boundaries.
results = shard_model(model, traced, split_points)
# Print each shard for inspection. A plain loop replaces the original
# side-effect-only list comprehension (whose built list was discarded).
for result in results:
    result.print_readable()
class GraphModule(torch.nn.Module): def forward(self, input_ids : torch.Tensor, attention_mask : torch.Tensor, labels : torch.Tensor): # No stacktrace found for following nodes size = input_ids.size() getitem = size[-1] view = input_ids.view(-1, getitem); input_ids = getitem = None size_1 = view.size() getitem_1 = size_1[0]; size_1 = None getitem_2 = size[-1] add = getitem_2 + 0; getitem_2 = None getattr_1 = view.device arange = torch.arange(0, add, dtype = torch.int64, device = getattr_1); add = getattr_1 = None unsqueeze = arange.unsqueeze(0); arange = None getitem_3 = size[-1] view_1 = unsqueeze.view(-1, getitem_3); unsqueeze = getitem_3 = None le = getitem_1 <= 0 view_2 = attention_mask.view(getitem_1, -1); attention_mask = getitem_1 = None getitem_4 = view_2[(slice(None, None, None), None, None, slice(None, None, None))]; view_2 = None to = getitem_4.to(dtype = torch.float32); getitem_4 = None sub = 1.0 - to; to = None mul = sub * -3.4028234663852886e+38; sub = None transformer_wte = self.transformer.wte(view); view = None transformer_wpe = self.transformer.wpe(view_1); view_1 = None add_1 = transformer_wte + transformer_wpe; transformer_wte = transformer_wpe = None transformer_drop = self.transformer.drop(add_1); add_1 = None size_2 = transformer_drop.size(-1) add_2 = size + (size_2,); size = size_2 = None transformer_h_0_ln_1 = getattr(self.transformer.h, "0").ln_1(transformer_drop) return (labels, mul, transformer_drop, add_2, transformer_h_0_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, transformer_drop, add_2, transformer_h_0_ln_1): # No stacktrace found for following nodes size_3 = transformer_h_0_ln_1.size() getitem_5 = size_3[slice(None, -1, None)]; size_3 = None add_3 = getitem_5 + (2304,); getitem_5 = None transformer_h_0_attn_c_attn_bias = getattr(self.transformer.h, "0").attn.c_attn.bias size_4 = transformer_h_0_ln_1.size(-1) view_3 = transformer_h_0_ln_1.view(-1, size_4); transformer_h_0_ln_1 = size_4 = None 
transformer_h_0_attn_c_attn_weight = getattr(self.transformer.h, "0").attn.c_attn.weight addmm = torch.addmm(transformer_h_0_attn_c_attn_bias, view_3, transformer_h_0_attn_c_attn_weight); transformer_h_0_attn_c_attn_bias = view_3 = transformer_h_0_attn_c_attn_weight = None view_4 = addmm.view(add_3); addmm = add_3 = None split = view_4.split(768, dim = 2); view_4 = None getitem_6 = split[0] getitem_7 = split[1] getitem_8 = split[2]; split = None size_5 = getitem_6.size() getitem_9 = size_5[slice(None, -1, None)]; size_5 = None add_4 = getitem_9 + (12, 64); getitem_9 = None view_5 = getitem_6.view(add_4); getitem_6 = add_4 = None permute = view_5.permute(0, 2, 1, 3); view_5 = None size_6 = getitem_7.size() getitem_10 = size_6[slice(None, -1, None)]; size_6 = None add_5 = getitem_10 + (12, 64); getitem_10 = None view_6 = getitem_7.view(add_5); getitem_7 = add_5 = None permute_1 = view_6.permute(0, 2, 1, 3); view_6 = None size_7 = getitem_8.size() getitem_11 = size_7[slice(None, -1, None)]; size_7 = None add_6 = getitem_11 + (12, 64); getitem_11 = None view_7 = getitem_8.view(add_6); getitem_8 = add_6 = None permute_2 = view_7.permute(0, 2, 1, 3); view_7 = None transpose = permute_1.transpose(-1, -2) matmul = torch.matmul(permute, transpose); transpose = None size_8 = permute_2.size(-1) pow_1 = size_8 ** 0.5; size_8 = None getattr_2 = matmul.dtype getattr_3 = matmul.device full = torch.full([], pow_1, dtype = getattr_2, device = getattr_3); pow_1 = getattr_2 = getattr_3 = None truediv = matmul / full; matmul = full = None size_9 = permute.size(-2); permute = None size_10 = permute_1.size(-2) transformer_h_0_attn_bias = getattr(self.transformer.h, "0").attn.bias sub_1 = size_10 - size_9; size_9 = None getitem_12 = transformer_h_0_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_1, size_10, None), slice(None, size_10, None))]; transformer_h_0_attn_bias = sub_1 = size_10 = None getattr_4 = truediv.dtype finfo = torch.finfo(getattr_4); getattr_4 = 
None getattr_5 = finfo.min; finfo = None getattr_6 = truediv.dtype full_1 = torch.full([], getattr_5, dtype = getattr_6); getattr_5 = getattr_6 = None getattr_7 = truediv.device to_1 = full_1.to(getattr_7); full_1 = getattr_7 = None getattr_8 = truediv.dtype to_2 = truediv.to(getattr_8); truediv = getattr_8 = None where = torch.where(getitem_12, to_2, to_1); getitem_12 = to_2 = to_1 = None add_7 = where + mul; where = None softmax = torch.nn.functional.softmax(add_7, dim = -1, _stacklevel = 3, dtype = None); add_7 = None getattr_9 = permute_2.dtype type_1 = softmax.type(getattr_9); softmax = getattr_9 = None transformer_h_0_attn_attn_dropout = getattr(self.transformer.h, "0").attn.attn_dropout(type_1); type_1 = None matmul_1 = torch.matmul(transformer_h_0_attn_attn_dropout, permute_2); transformer_h_0_attn_attn_dropout = None permute_3 = matmul_1.permute(0, 2, 1, 3); matmul_1 = None contiguous = permute_3.contiguous(); permute_3 = None size_11 = contiguous.size() getitem_13 = size_11[slice(None, -2, None)]; size_11 = None add_8 = getitem_13 + (768,); getitem_13 = None view_8 = contiguous.view(add_8); contiguous = add_8 = None size_12 = view_8.size() getitem_14 = size_12[slice(None, -1, None)]; size_12 = None add_9 = getitem_14 + (768,); getitem_14 = None transformer_h_0_attn_c_proj_bias = getattr(self.transformer.h, "0").attn.c_proj.bias size_13 = view_8.size(-1) view_9 = view_8.view(-1, size_13); view_8 = size_13 = None transformer_h_0_attn_c_proj_weight = getattr(self.transformer.h, "0").attn.c_proj.weight addmm_1 = torch.addmm(transformer_h_0_attn_c_proj_bias, view_9, transformer_h_0_attn_c_proj_weight); transformer_h_0_attn_c_proj_bias = view_9 = transformer_h_0_attn_c_proj_weight = None view_10 = addmm_1.view(add_9); addmm_1 = add_9 = None transformer_h_0_attn_resid_dropout = getattr(self.transformer.h, "0").attn.resid_dropout(view_10); view_10 = None add_10 = transformer_h_0_attn_resid_dropout + transformer_drop; transformer_h_0_attn_resid_dropout = 
transformer_drop = None transformer_h_0_ln_2 = getattr(self.transformer.h, "0").ln_2(add_10) size_14 = transformer_h_0_ln_2.size() getitem_15 = size_14[slice(None, -1, None)]; size_14 = None add_11 = getitem_15 + (3072,); getitem_15 = None transformer_h_0_mlp_c_fc_bias = getattr(self.transformer.h, "0").mlp.c_fc.bias size_15 = transformer_h_0_ln_2.size(-1) view_11 = transformer_h_0_ln_2.view(-1, size_15); transformer_h_0_ln_2 = size_15 = None transformer_h_0_mlp_c_fc_weight = getattr(self.transformer.h, "0").mlp.c_fc.weight addmm_2 = torch.addmm(transformer_h_0_mlp_c_fc_bias, view_11, transformer_h_0_mlp_c_fc_weight); transformer_h_0_mlp_c_fc_bias = view_11 = transformer_h_0_mlp_c_fc_weight = None view_12 = addmm_2.view(add_11); addmm_2 = add_11 = None mul_1 = 0.5 * view_12 pow_2 = torch.pow(view_12, 3.0) mul_2 = 0.044715 * pow_2; pow_2 = None add_12 = view_12 + mul_2; view_12 = mul_2 = None mul_3 = 0.7978845608028654 * add_12; add_12 = None tanh = torch.tanh(mul_3); mul_3 = None add_13 = 1.0 + tanh; tanh = None mul_4 = mul_1 * add_13; mul_1 = add_13 = None size_16 = mul_4.size() getitem_16 = size_16[slice(None, -1, None)]; size_16 = None add_14 = getitem_16 + (768,); getitem_16 = None transformer_h_0_mlp_c_proj_bias = getattr(self.transformer.h, "0").mlp.c_proj.bias size_17 = mul_4.size(-1) view_13 = mul_4.view(-1, size_17); mul_4 = size_17 = None transformer_h_0_mlp_c_proj_weight = getattr(self.transformer.h, "0").mlp.c_proj.weight addmm_3 = torch.addmm(transformer_h_0_mlp_c_proj_bias, view_13, transformer_h_0_mlp_c_proj_weight); transformer_h_0_mlp_c_proj_bias = view_13 = transformer_h_0_mlp_c_proj_weight = None view_14 = addmm_3.view(add_14); addmm_3 = add_14 = None transformer_h_0_mlp_dropout = getattr(self.transformer.h, "0").mlp.dropout(view_14); view_14 = None add_15 = add_10 + transformer_h_0_mlp_dropout; add_10 = transformer_h_0_mlp_dropout = None transformer_h_1_ln_1 = getattr(self.transformer.h, "1").ln_1(add_15) return (labels, mul, add_2, permute_1, 
permute_2, add_15, transformer_h_1_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, add_15, transformer_h_1_ln_1): # No stacktrace found for following nodes size_18 = transformer_h_1_ln_1.size() getitem_17 = size_18[slice(None, -1, None)]; size_18 = None add_16 = getitem_17 + (2304,); getitem_17 = None transformer_h_1_attn_c_attn_bias = getattr(self.transformer.h, "1").attn.c_attn.bias size_19 = transformer_h_1_ln_1.size(-1) view_15 = transformer_h_1_ln_1.view(-1, size_19); transformer_h_1_ln_1 = size_19 = None transformer_h_1_attn_c_attn_weight = getattr(self.transformer.h, "1").attn.c_attn.weight addmm_4 = torch.addmm(transformer_h_1_attn_c_attn_bias, view_15, transformer_h_1_attn_c_attn_weight); transformer_h_1_attn_c_attn_bias = view_15 = transformer_h_1_attn_c_attn_weight = None view_16 = addmm_4.view(add_16); addmm_4 = add_16 = None split_1 = view_16.split(768, dim = 2); view_16 = None getitem_18 = split_1[0] getitem_19 = split_1[1] getitem_20 = split_1[2]; split_1 = None size_20 = getitem_18.size() getitem_21 = size_20[slice(None, -1, None)]; size_20 = None add_17 = getitem_21 + (12, 64); getitem_21 = None view_17 = getitem_18.view(add_17); getitem_18 = add_17 = None permute_4 = view_17.permute(0, 2, 1, 3); view_17 = None size_21 = getitem_19.size() getitem_22 = size_21[slice(None, -1, None)]; size_21 = None add_18 = getitem_22 + (12, 64); getitem_22 = None view_18 = getitem_19.view(add_18); getitem_19 = add_18 = None permute_5 = view_18.permute(0, 2, 1, 3); view_18 = None size_22 = getitem_20.size() getitem_23 = size_22[slice(None, -1, None)]; size_22 = None add_19 = getitem_23 + (12, 64); getitem_23 = None view_19 = getitem_20.view(add_19); getitem_20 = add_19 = None permute_6 = view_19.permute(0, 2, 1, 3); view_19 = None transpose_1 = permute_5.transpose(-1, -2) matmul_2 = torch.matmul(permute_4, transpose_1); transpose_1 = None size_23 = permute_6.size(-1) pow_3 = size_23 ** 0.5; size_23 = None 
getattr_10 = matmul_2.dtype getattr_11 = matmul_2.device full_2 = torch.full([], pow_3, dtype = getattr_10, device = getattr_11); pow_3 = getattr_10 = getattr_11 = None truediv_1 = matmul_2 / full_2; matmul_2 = full_2 = None size_24 = permute_4.size(-2); permute_4 = None size_25 = permute_5.size(-2) transformer_h_1_attn_bias = getattr(self.transformer.h, "1").attn.bias sub_2 = size_25 - size_24; size_24 = None getitem_24 = transformer_h_1_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_2, size_25, None), slice(None, size_25, None))]; transformer_h_1_attn_bias = sub_2 = size_25 = None getattr_12 = truediv_1.dtype finfo_1 = torch.finfo(getattr_12); getattr_12 = None getattr_13 = finfo_1.min; finfo_1 = None getattr_14 = truediv_1.dtype full_3 = torch.full([], getattr_13, dtype = getattr_14); getattr_13 = getattr_14 = None getattr_15 = truediv_1.device to_3 = full_3.to(getattr_15); full_3 = getattr_15 = None getattr_16 = truediv_1.dtype to_4 = truediv_1.to(getattr_16); truediv_1 = getattr_16 = None where_1 = torch.where(getitem_24, to_4, to_3); getitem_24 = to_4 = to_3 = None add_20 = where_1 + mul; where_1 = None softmax_1 = torch.nn.functional.softmax(add_20, dim = -1, _stacklevel = 3, dtype = None); add_20 = None getattr_17 = permute_6.dtype type_2 = softmax_1.type(getattr_17); softmax_1 = getattr_17 = None transformer_h_1_attn_attn_dropout = getattr(self.transformer.h, "1").attn.attn_dropout(type_2); type_2 = None matmul_3 = torch.matmul(transformer_h_1_attn_attn_dropout, permute_6); transformer_h_1_attn_attn_dropout = None permute_7 = matmul_3.permute(0, 2, 1, 3); matmul_3 = None contiguous_1 = permute_7.contiguous(); permute_7 = None size_26 = contiguous_1.size() getitem_25 = size_26[slice(None, -2, None)]; size_26 = None add_21 = getitem_25 + (768,); getitem_25 = None view_20 = contiguous_1.view(add_21); contiguous_1 = add_21 = None size_27 = view_20.size() getitem_26 = size_27[slice(None, -1, None)]; size_27 = None add_22 = getitem_26 + 
(768,); getitem_26 = None transformer_h_1_attn_c_proj_bias = getattr(self.transformer.h, "1").attn.c_proj.bias size_28 = view_20.size(-1) view_21 = view_20.view(-1, size_28); view_20 = size_28 = None transformer_h_1_attn_c_proj_weight = getattr(self.transformer.h, "1").attn.c_proj.weight addmm_5 = torch.addmm(transformer_h_1_attn_c_proj_bias, view_21, transformer_h_1_attn_c_proj_weight); transformer_h_1_attn_c_proj_bias = view_21 = transformer_h_1_attn_c_proj_weight = None view_22 = addmm_5.view(add_22); addmm_5 = add_22 = None transformer_h_1_attn_resid_dropout = getattr(self.transformer.h, "1").attn.resid_dropout(view_22); view_22 = None add_23 = transformer_h_1_attn_resid_dropout + add_15; transformer_h_1_attn_resid_dropout = add_15 = None transformer_h_1_ln_2 = getattr(self.transformer.h, "1").ln_2(add_23) size_29 = transformer_h_1_ln_2.size() getitem_27 = size_29[slice(None, -1, None)]; size_29 = None add_24 = getitem_27 + (3072,); getitem_27 = None transformer_h_1_mlp_c_fc_bias = getattr(self.transformer.h, "1").mlp.c_fc.bias size_30 = transformer_h_1_ln_2.size(-1) view_23 = transformer_h_1_ln_2.view(-1, size_30); transformer_h_1_ln_2 = size_30 = None transformer_h_1_mlp_c_fc_weight = getattr(self.transformer.h, "1").mlp.c_fc.weight addmm_6 = torch.addmm(transformer_h_1_mlp_c_fc_bias, view_23, transformer_h_1_mlp_c_fc_weight); transformer_h_1_mlp_c_fc_bias = view_23 = transformer_h_1_mlp_c_fc_weight = None view_24 = addmm_6.view(add_24); addmm_6 = add_24 = None mul_5 = 0.5 * view_24 pow_4 = torch.pow(view_24, 3.0) mul_6 = 0.044715 * pow_4; pow_4 = None add_25 = view_24 + mul_6; view_24 = mul_6 = None mul_7 = 0.7978845608028654 * add_25; add_25 = None tanh_1 = torch.tanh(mul_7); mul_7 = None add_26 = 1.0 + tanh_1; tanh_1 = None mul_8 = mul_5 * add_26; mul_5 = add_26 = None size_31 = mul_8.size() getitem_28 = size_31[slice(None, -1, None)]; size_31 = None add_27 = getitem_28 + (768,); getitem_28 = None transformer_h_1_mlp_c_proj_bias = 
getattr(self.transformer.h, "1").mlp.c_proj.bias size_32 = mul_8.size(-1) view_25 = mul_8.view(-1, size_32); mul_8 = size_32 = None transformer_h_1_mlp_c_proj_weight = getattr(self.transformer.h, "1").mlp.c_proj.weight addmm_7 = torch.addmm(transformer_h_1_mlp_c_proj_bias, view_25, transformer_h_1_mlp_c_proj_weight); transformer_h_1_mlp_c_proj_bias = view_25 = transformer_h_1_mlp_c_proj_weight = None view_26 = addmm_7.view(add_27); addmm_7 = add_27 = None transformer_h_1_mlp_dropout = getattr(self.transformer.h, "1").mlp.dropout(view_26); view_26 = None add_28 = add_23 + transformer_h_1_mlp_dropout; add_23 = transformer_h_1_mlp_dropout = None transformer_h_2_ln_1 = getattr(self.transformer.h, "2").ln_1(add_28) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, add_28, transformer_h_2_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, add_28, transformer_h_2_ln_1): # No stacktrace found for following nodes size_33 = transformer_h_2_ln_1.size() getitem_29 = size_33[slice(None, -1, None)]; size_33 = None add_29 = getitem_29 + (2304,); getitem_29 = None transformer_h_2_attn_c_attn_bias = getattr(self.transformer.h, "2").attn.c_attn.bias size_34 = transformer_h_2_ln_1.size(-1) view_27 = transformer_h_2_ln_1.view(-1, size_34); transformer_h_2_ln_1 = size_34 = None transformer_h_2_attn_c_attn_weight = getattr(self.transformer.h, "2").attn.c_attn.weight addmm_8 = torch.addmm(transformer_h_2_attn_c_attn_bias, view_27, transformer_h_2_attn_c_attn_weight); transformer_h_2_attn_c_attn_bias = view_27 = transformer_h_2_attn_c_attn_weight = None view_28 = addmm_8.view(add_29); addmm_8 = add_29 = None split_2 = view_28.split(768, dim = 2); view_28 = None getitem_30 = split_2[0] getitem_31 = split_2[1] getitem_32 = split_2[2]; split_2 = None size_35 = getitem_30.size() getitem_33 = size_35[slice(None, -1, None)]; size_35 = None add_30 = getitem_33 + (12, 64); getitem_33 = None view_29 = 
getitem_30.view(add_30); getitem_30 = add_30 = None permute_8 = view_29.permute(0, 2, 1, 3); view_29 = None size_36 = getitem_31.size() getitem_34 = size_36[slice(None, -1, None)]; size_36 = None add_31 = getitem_34 + (12, 64); getitem_34 = None view_30 = getitem_31.view(add_31); getitem_31 = add_31 = None permute_9 = view_30.permute(0, 2, 1, 3); view_30 = None size_37 = getitem_32.size() getitem_35 = size_37[slice(None, -1, None)]; size_37 = None add_32 = getitem_35 + (12, 64); getitem_35 = None view_31 = getitem_32.view(add_32); getitem_32 = add_32 = None permute_10 = view_31.permute(0, 2, 1, 3); view_31 = None transpose_2 = permute_9.transpose(-1, -2) matmul_4 = torch.matmul(permute_8, transpose_2); transpose_2 = None size_38 = permute_10.size(-1) pow_5 = size_38 ** 0.5; size_38 = None getattr_18 = matmul_4.dtype getattr_19 = matmul_4.device full_4 = torch.full([], pow_5, dtype = getattr_18, device = getattr_19); pow_5 = getattr_18 = getattr_19 = None truediv_2 = matmul_4 / full_4; matmul_4 = full_4 = None size_39 = permute_8.size(-2); permute_8 = None size_40 = permute_9.size(-2) transformer_h_2_attn_bias = getattr(self.transformer.h, "2").attn.bias sub_3 = size_40 - size_39; size_39 = None getitem_36 = transformer_h_2_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_3, size_40, None), slice(None, size_40, None))]; transformer_h_2_attn_bias = sub_3 = size_40 = None getattr_20 = truediv_2.dtype finfo_2 = torch.finfo(getattr_20); getattr_20 = None getattr_21 = finfo_2.min; finfo_2 = None getattr_22 = truediv_2.dtype full_5 = torch.full([], getattr_21, dtype = getattr_22); getattr_21 = getattr_22 = None getattr_23 = truediv_2.device to_5 = full_5.to(getattr_23); full_5 = getattr_23 = None getattr_24 = truediv_2.dtype to_6 = truediv_2.to(getattr_24); truediv_2 = getattr_24 = None where_2 = torch.where(getitem_36, to_6, to_5); getitem_36 = to_6 = to_5 = None add_33 = where_2 + mul; where_2 = None softmax_2 = torch.nn.functional.softmax(add_33, 
dim = -1, _stacklevel = 3, dtype = None); add_33 = None getattr_25 = permute_10.dtype type_3 = softmax_2.type(getattr_25); softmax_2 = getattr_25 = None transformer_h_2_attn_attn_dropout = getattr(self.transformer.h, "2").attn.attn_dropout(type_3); type_3 = None matmul_5 = torch.matmul(transformer_h_2_attn_attn_dropout, permute_10); transformer_h_2_attn_attn_dropout = None permute_11 = matmul_5.permute(0, 2, 1, 3); matmul_5 = None contiguous_2 = permute_11.contiguous(); permute_11 = None size_41 = contiguous_2.size() getitem_37 = size_41[slice(None, -2, None)]; size_41 = None add_34 = getitem_37 + (768,); getitem_37 = None view_32 = contiguous_2.view(add_34); contiguous_2 = add_34 = None size_42 = view_32.size() getitem_38 = size_42[slice(None, -1, None)]; size_42 = None add_35 = getitem_38 + (768,); getitem_38 = None transformer_h_2_attn_c_proj_bias = getattr(self.transformer.h, "2").attn.c_proj.bias size_43 = view_32.size(-1) view_33 = view_32.view(-1, size_43); view_32 = size_43 = None transformer_h_2_attn_c_proj_weight = getattr(self.transformer.h, "2").attn.c_proj.weight addmm_9 = torch.addmm(transformer_h_2_attn_c_proj_bias, view_33, transformer_h_2_attn_c_proj_weight); transformer_h_2_attn_c_proj_bias = view_33 = transformer_h_2_attn_c_proj_weight = None view_34 = addmm_9.view(add_35); addmm_9 = add_35 = None transformer_h_2_attn_resid_dropout = getattr(self.transformer.h, "2").attn.resid_dropout(view_34); view_34 = None add_36 = transformer_h_2_attn_resid_dropout + add_28; transformer_h_2_attn_resid_dropout = add_28 = None transformer_h_2_ln_2 = getattr(self.transformer.h, "2").ln_2(add_36) size_44 = transformer_h_2_ln_2.size() getitem_39 = size_44[slice(None, -1, None)]; size_44 = None add_37 = getitem_39 + (3072,); getitem_39 = None transformer_h_2_mlp_c_fc_bias = getattr(self.transformer.h, "2").mlp.c_fc.bias size_45 = transformer_h_2_ln_2.size(-1) view_35 = transformer_h_2_ln_2.view(-1, size_45); transformer_h_2_ln_2 = size_45 = None 
transformer_h_2_mlp_c_fc_weight = getattr(self.transformer.h, "2").mlp.c_fc.weight addmm_10 = torch.addmm(transformer_h_2_mlp_c_fc_bias, view_35, transformer_h_2_mlp_c_fc_weight); transformer_h_2_mlp_c_fc_bias = view_35 = transformer_h_2_mlp_c_fc_weight = None view_36 = addmm_10.view(add_37); addmm_10 = add_37 = None mul_9 = 0.5 * view_36 pow_6 = torch.pow(view_36, 3.0) mul_10 = 0.044715 * pow_6; pow_6 = None add_38 = view_36 + mul_10; view_36 = mul_10 = None mul_11 = 0.7978845608028654 * add_38; add_38 = None tanh_2 = torch.tanh(mul_11); mul_11 = None add_39 = 1.0 + tanh_2; tanh_2 = None mul_12 = mul_9 * add_39; mul_9 = add_39 = None size_46 = mul_12.size() getitem_40 = size_46[slice(None, -1, None)]; size_46 = None add_40 = getitem_40 + (768,); getitem_40 = None transformer_h_2_mlp_c_proj_bias = getattr(self.transformer.h, "2").mlp.c_proj.bias size_47 = mul_12.size(-1) view_37 = mul_12.view(-1, size_47); mul_12 = size_47 = None transformer_h_2_mlp_c_proj_weight = getattr(self.transformer.h, "2").mlp.c_proj.weight addmm_11 = torch.addmm(transformer_h_2_mlp_c_proj_bias, view_37, transformer_h_2_mlp_c_proj_weight); transformer_h_2_mlp_c_proj_bias = view_37 = transformer_h_2_mlp_c_proj_weight = None view_38 = addmm_11.view(add_40); addmm_11 = add_40 = None transformer_h_2_mlp_dropout = getattr(self.transformer.h, "2").mlp.dropout(view_38); view_38 = None add_41 = add_36 + transformer_h_2_mlp_dropout; add_36 = transformer_h_2_mlp_dropout = None transformer_h_3_ln_1 = getattr(self.transformer.h, "3").ln_1(add_41) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, add_41, transformer_h_3_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, add_41, transformer_h_3_ln_1): # No stacktrace found for following nodes size_48 = transformer_h_3_ln_1.size() getitem_41 = size_48[slice(None, -1, None)]; size_48 = None add_42 = getitem_41 + 
(2304,); getitem_41 = None transformer_h_3_attn_c_attn_bias = getattr(self.transformer.h, "3").attn.c_attn.bias size_49 = transformer_h_3_ln_1.size(-1) view_39 = transformer_h_3_ln_1.view(-1, size_49); transformer_h_3_ln_1 = size_49 = None transformer_h_3_attn_c_attn_weight = getattr(self.transformer.h, "3").attn.c_attn.weight addmm_12 = torch.addmm(transformer_h_3_attn_c_attn_bias, view_39, transformer_h_3_attn_c_attn_weight); transformer_h_3_attn_c_attn_bias = view_39 = transformer_h_3_attn_c_attn_weight = None view_40 = addmm_12.view(add_42); addmm_12 = add_42 = None split_3 = view_40.split(768, dim = 2); view_40 = None getitem_42 = split_3[0] getitem_43 = split_3[1] getitem_44 = split_3[2]; split_3 = None size_50 = getitem_42.size() getitem_45 = size_50[slice(None, -1, None)]; size_50 = None add_43 = getitem_45 + (12, 64); getitem_45 = None view_41 = getitem_42.view(add_43); getitem_42 = add_43 = None permute_12 = view_41.permute(0, 2, 1, 3); view_41 = None size_51 = getitem_43.size() getitem_46 = size_51[slice(None, -1, None)]; size_51 = None add_44 = getitem_46 + (12, 64); getitem_46 = None view_42 = getitem_43.view(add_44); getitem_43 = add_44 = None permute_13 = view_42.permute(0, 2, 1, 3); view_42 = None size_52 = getitem_44.size() getitem_47 = size_52[slice(None, -1, None)]; size_52 = None add_45 = getitem_47 + (12, 64); getitem_47 = None view_43 = getitem_44.view(add_45); getitem_44 = add_45 = None permute_14 = view_43.permute(0, 2, 1, 3); view_43 = None transpose_3 = permute_13.transpose(-1, -2) matmul_6 = torch.matmul(permute_12, transpose_3); transpose_3 = None size_53 = permute_14.size(-1) pow_7 = size_53 ** 0.5; size_53 = None getattr_26 = matmul_6.dtype getattr_27 = matmul_6.device full_6 = torch.full([], pow_7, dtype = getattr_26, device = getattr_27); pow_7 = getattr_26 = getattr_27 = None truediv_3 = matmul_6 / full_6; matmul_6 = full_6 = None size_54 = permute_12.size(-2); permute_12 = None size_55 = permute_13.size(-2) 
# NOTE(review): Auto-generated torch.fx GraphModule code — the printed result of
# symbolically tracing and pipeline-splitting a GPT-2 model (transformer blocks
# h.3 through h.9 are visible here; each split stage is its own
# `class GraphModule(torch.nn.Module)` whose forward threads `labels`, the
# attention-mask tensor `mul`, the cached per-layer key/value tensors
# (`permute_*`) and the running hidden state (`add_*`) to the next stage).
# The text is hard-wrapped mid-statement (e.g. an assignment split across two
# physical lines), so this chunk is NOT valid Python as it stands — regenerate
# it from the traced model rather than editing by hand. The comments inserted
# below only annotate the repeating per-layer pattern; all code is untouched.
transformer_h_3_attn_bias = getattr(self.transformer.h, "3").attn.bias sub_4 = size_55 - size_54; size_54 = None getitem_48 = transformer_h_3_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_4, size_55, None), slice(None, size_55, None))]; transformer_h_3_attn_bias = sub_4 = size_55 = None getattr_28 = truediv_3.dtype finfo_3 = torch.finfo(getattr_28); getattr_28 = None getattr_29 = finfo_3.min; finfo_3 = None getattr_30 = truediv_3.dtype full_7 = torch.full([], getattr_29, dtype = getattr_30); getattr_29 = getattr_30 = None getattr_31 = truediv_3.device to_7 = full_7.to(getattr_31); full_7 = getattr_31 = None getattr_32 = truediv_3.dtype to_8 = truediv_3.to(getattr_32); truediv_3 = getattr_32 = None where_3 = torch.where(getitem_48, to_8, to_7); getitem_48 = to_8 = to_7 = None add_46 = where_3 + mul; where_3 = None softmax_3 = torch.nn.functional.softmax(add_46, dim = -1, _stacklevel = 3, dtype = None); add_46 = None getattr_33 = permute_14.dtype type_4 = softmax_3.type(getattr_33); softmax_3 = getattr_33 = None transformer_h_3_attn_attn_dropout = getattr(self.transformer.h, "3").attn.attn_dropout(type_4); type_4 = None matmul_7 = torch.matmul(transformer_h_3_attn_attn_dropout, permute_14); transformer_h_3_attn_attn_dropout = None permute_15 = matmul_7.permute(0, 2, 1, 3); matmul_7 = None contiguous_3 = permute_15.contiguous(); permute_15 = None size_56 = contiguous_3.size() getitem_49 = size_56[slice(None, -2, None)]; size_56 = None add_47 = getitem_49 + (768,); getitem_49 = None view_44 = contiguous_3.view(add_47); contiguous_3 = add_47 = None size_57 = view_44.size() getitem_50 = size_57[slice(None, -1, None)]; size_57 = None add_48 = getitem_50 + (768,); getitem_50 = None transformer_h_3_attn_c_proj_bias = getattr(self.transformer.h, "3").attn.c_proj.bias size_58 = view_44.size(-1) view_45 = view_44.view(-1, size_58); view_44 = size_58 = None transformer_h_3_attn_c_proj_weight = getattr(self.transformer.h, "3").attn.c_proj.weight addmm_13 
= torch.addmm(transformer_h_3_attn_c_proj_bias, view_45, transformer_h_3_attn_c_proj_weight); transformer_h_3_attn_c_proj_bias = view_45 = transformer_h_3_attn_c_proj_weight = None view_46 = addmm_13.view(add_48); addmm_13 = add_48 = None transformer_h_3_attn_resid_dropout = getattr(self.transformer.h, "3").attn.resid_dropout(view_46); view_46 = None add_49 = transformer_h_3_attn_resid_dropout + add_41; transformer_h_3_attn_resid_dropout = add_41 = None transformer_h_3_ln_2 = getattr(self.transformer.h, "3").ln_2(add_49) size_59 = transformer_h_3_ln_2.size() getitem_51 = size_59[slice(None, -1, None)]; size_59 = None add_50 = getitem_51 + (3072,); getitem_51 = None transformer_h_3_mlp_c_fc_bias = getattr(self.transformer.h, "3").mlp.c_fc.bias size_60 = transformer_h_3_ln_2.size(-1) view_47 = transformer_h_3_ln_2.view(-1, size_60); transformer_h_3_ln_2 = size_60 = None transformer_h_3_mlp_c_fc_weight = getattr(self.transformer.h, "3").mlp.c_fc.weight addmm_14 = torch.addmm(transformer_h_3_mlp_c_fc_bias, view_47, transformer_h_3_mlp_c_fc_weight); transformer_h_3_mlp_c_fc_bias = view_47 = transformer_h_3_mlp_c_fc_weight = None view_48 = addmm_14.view(add_50); addmm_14 = add_50 = None mul_13 = 0.5 * view_48 pow_8 = torch.pow(view_48, 3.0) mul_14 = 0.044715 * pow_8; pow_8 = None add_51 = view_48 + mul_14; view_48 = mul_14 = None mul_15 = 0.7978845608028654 * add_51; add_51 = None tanh_3 = torch.tanh(mul_15); mul_15 = None add_52 = 1.0 + tanh_3; tanh_3 = None mul_16 = mul_13 * add_52; mul_13 = add_52 = None size_61 = mul_16.size() getitem_52 = size_61[slice(None, -1, None)]; size_61 = None add_53 = getitem_52 + (768,); getitem_52 = None transformer_h_3_mlp_c_proj_bias = getattr(self.transformer.h, "3").mlp.c_proj.bias size_62 = mul_16.size(-1) view_49 = mul_16.view(-1, size_62); mul_16 = size_62 = None transformer_h_3_mlp_c_proj_weight = getattr(self.transformer.h, "3").mlp.c_proj.weight addmm_15 = torch.addmm(transformer_h_3_mlp_c_proj_bias, view_49, 
transformer_h_3_mlp_c_proj_weight); transformer_h_3_mlp_c_proj_bias = view_49 = transformer_h_3_mlp_c_proj_weight = None view_50 = addmm_15.view(add_53); addmm_15 = add_53 = None transformer_h_3_mlp_dropout = getattr(self.transformer.h, "3").mlp.dropout(view_50); view_50 = None add_54 = add_49 + transformer_h_3_mlp_dropout; add_49 = transformer_h_3_mlp_dropout = None transformer_h_4_ln_1 = getattr(self.transformer.h, "4").ln_1(add_54) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, add_54, transformer_h_4_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, add_54, transformer_h_4_ln_1): # No stacktrace found for following nodes size_63 = transformer_h_4_ln_1.size() getitem_53 = size_63[slice(None, -1, None)]; size_63 = None add_55 = getitem_53 + (2304,); getitem_53 = None transformer_h_4_attn_c_attn_bias = getattr(self.transformer.h, "4").attn.c_attn.bias size_64 = transformer_h_4_ln_1.size(-1) view_51 = transformer_h_4_ln_1.view(-1, size_64); transformer_h_4_ln_1 = size_64 = None transformer_h_4_attn_c_attn_weight = getattr(self.transformer.h, "4").attn.c_attn.weight addmm_16 = torch.addmm(transformer_h_4_attn_c_attn_bias, view_51, transformer_h_4_attn_c_attn_weight); transformer_h_4_attn_c_attn_bias = view_51 = transformer_h_4_attn_c_attn_weight = None view_52 = addmm_16.view(add_55); addmm_16 = add_55 = None split_4 = view_52.split(768, dim = 2); view_52 = None getitem_54 = split_4[0] getitem_55 = split_4[1] getitem_56 = split_4[2]; split_4 = None size_65 = getitem_54.size() getitem_57 = size_65[slice(None, -1, None)]; size_65 = None add_56 = getitem_57 + (12, 64); getitem_57 = None view_53 = getitem_54.view(add_56); getitem_54 = add_56 = None permute_16 = view_53.permute(0, 2, 1, 3); view_53 = None size_66 = getitem_55.size() getitem_58 = size_66[slice(None, -1, None)]; 
size_66 = None add_57 = getitem_58 + (12, 64); getitem_58 = None view_54 = getitem_55.view(add_57); getitem_55 = add_57 = None permute_17 = view_54.permute(0, 2, 1, 3); view_54 = None size_67 = getitem_56.size() getitem_59 = size_67[slice(None, -1, None)]; size_67 = None add_58 = getitem_59 + (12, 64); getitem_59 = None view_55 = getitem_56.view(add_58); getitem_56 = add_58 = None permute_18 = view_55.permute(0, 2, 1, 3); view_55 = None transpose_4 = permute_17.transpose(-1, -2) matmul_8 = torch.matmul(permute_16, transpose_4); transpose_4 = None size_68 = permute_18.size(-1) pow_9 = size_68 ** 0.5; size_68 = None getattr_34 = matmul_8.dtype getattr_35 = matmul_8.device full_8 = torch.full([], pow_9, dtype = getattr_34, device = getattr_35); pow_9 = getattr_34 = getattr_35 = None truediv_4 = matmul_8 / full_8; matmul_8 = full_8 = None size_69 = permute_16.size(-2); permute_16 = None size_70 = permute_17.size(-2) transformer_h_4_attn_bias = getattr(self.transformer.h, "4").attn.bias sub_5 = size_70 - size_69; size_69 = None getitem_60 = transformer_h_4_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_5, size_70, None), slice(None, size_70, None))]; transformer_h_4_attn_bias = sub_5 = size_70 = None getattr_36 = truediv_4.dtype finfo_4 = torch.finfo(getattr_36); getattr_36 = None getattr_37 = finfo_4.min; finfo_4 = None getattr_38 = truediv_4.dtype full_9 = torch.full([], getattr_37, dtype = getattr_38); getattr_37 = getattr_38 = None getattr_39 = truediv_4.device to_9 = full_9.to(getattr_39); full_9 = getattr_39 = None getattr_40 = truediv_4.dtype to_10 = truediv_4.to(getattr_40); truediv_4 = getattr_40 = None where_4 = torch.where(getitem_60, to_10, to_9); getitem_60 = to_10 = to_9 = None add_59 = where_4 + mul; where_4 = None softmax_4 = torch.nn.functional.softmax(add_59, dim = -1, _stacklevel = 3, dtype = None); add_59 = None getattr_41 = permute_18.dtype type_5 = softmax_4.type(getattr_41); softmax_4 = getattr_41 = None 
# h.4 attention epilogue: attention-weight dropout, weighted sum over the value
# heads (matmul with permute_18), head merge back to 768 (permute + contiguous
# + view), then the c_proj output projection via torch.addmm.
transformer_h_4_attn_attn_dropout = getattr(self.transformer.h, "4").attn.attn_dropout(type_5); type_5 = None matmul_9 = torch.matmul(transformer_h_4_attn_attn_dropout, permute_18); transformer_h_4_attn_attn_dropout = None permute_19 = matmul_9.permute(0, 2, 1, 3); matmul_9 = None contiguous_4 = permute_19.contiguous(); permute_19 = None size_71 = contiguous_4.size() getitem_61 = size_71[slice(None, -2, None)]; size_71 = None add_60 = getitem_61 + (768,); getitem_61 = None view_56 = contiguous_4.view(add_60); contiguous_4 = add_60 = None size_72 = view_56.size() getitem_62 = size_72[slice(None, -1, None)]; size_72 = None add_61 = getitem_62 + (768,); getitem_62 = None transformer_h_4_attn_c_proj_bias = getattr(self.transformer.h, "4").attn.c_proj.bias size_73 = view_56.size(-1) view_57 = view_56.view(-1, size_73); view_56 = size_73 = None transformer_h_4_attn_c_proj_weight = getattr(self.transformer.h, "4").attn.c_proj.weight addmm_17 = torch.addmm(transformer_h_4_attn_c_proj_bias, view_57, transformer_h_4_attn_c_proj_weight); transformer_h_4_attn_c_proj_bias = view_57 = transformer_h_4_attn_c_proj_weight = None view_58 = addmm_17.view(add_61); addmm_17 = add_61 = None transformer_h_4_attn_resid_dropout = getattr(self.transformer.h, "4").attn.resid_dropout(view_58); view_58 = None add_62 = transformer_h_4_attn_resid_dropout + add_54; transformer_h_4_attn_resid_dropout = add_54 = None transformer_h_4_ln_2 = getattr(self.transformer.h, "4").ln_2(add_62) size_74 = transformer_h_4_ln_2.size() getitem_63 = size_74[slice(None, -1, None)]; size_74 = None add_63 = getitem_63 + (3072,); getitem_63 = None transformer_h_4_mlp_c_fc_bias = getattr(self.transformer.h, "4").mlp.c_fc.bias size_75 = transformer_h_4_ln_2.size(-1) view_59 = transformer_h_4_ln_2.view(-1, size_75); transformer_h_4_ln_2 = size_75 = None transformer_h_4_mlp_c_fc_weight = getattr(self.transformer.h, "4").mlp.c_fc.weight addmm_18 = torch.addmm(transformer_h_4_mlp_c_fc_bias, view_59, 
transformer_h_4_mlp_c_fc_weight); transformer_h_4_mlp_c_fc_bias = view_59 = transformer_h_4_mlp_c_fc_weight = None view_60 = addmm_18.view(add_63); addmm_18 = add_63 = None mul_17 = 0.5 * view_60 pow_10 = torch.pow(view_60, 3.0) mul_18 = 0.044715 * pow_10; pow_10 = None add_64 = view_60 + mul_18; view_60 = mul_18 = None mul_19 = 0.7978845608028654 * add_64; add_64 = None tanh_4 = torch.tanh(mul_19); mul_19 = None add_65 = 1.0 + tanh_4; tanh_4 = None mul_20 = mul_17 * add_65; mul_17 = add_65 = None size_76 = mul_20.size() getitem_64 = size_76[slice(None, -1, None)]; size_76 = None add_66 = getitem_64 + (768,); getitem_64 = None transformer_h_4_mlp_c_proj_bias = getattr(self.transformer.h, "4").mlp.c_proj.bias size_77 = mul_20.size(-1) view_61 = mul_20.view(-1, size_77); mul_20 = size_77 = None transformer_h_4_mlp_c_proj_weight = getattr(self.transformer.h, "4").mlp.c_proj.weight addmm_19 = torch.addmm(transformer_h_4_mlp_c_proj_bias, view_61, transformer_h_4_mlp_c_proj_weight); transformer_h_4_mlp_c_proj_bias = view_61 = transformer_h_4_mlp_c_proj_weight = None view_62 = addmm_19.view(add_66); addmm_19 = add_66 = None transformer_h_4_mlp_dropout = getattr(self.transformer.h, "4").mlp.dropout(view_62); view_62 = None add_67 = add_62 + transformer_h_4_mlp_dropout; add_62 = transformer_h_4_mlp_dropout = None transformer_h_5_ln_1 = getattr(self.transformer.h, "5").ln_1(add_67) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, add_67, transformer_h_5_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, add_67, transformer_h_5_ln_1): # No stacktrace found for following nodes size_78 = transformer_h_5_ln_1.size() getitem_65 = size_78[slice(None, -1, None)]; size_78 = None add_68 = getitem_65 + (2304,); getitem_65 = None 
# h.5 block: c_attn addmm projects the hidden size 768 -> 2304, then
# split(768, dim=2) yields query/key/value, each reshaped to (..., 12, 64)
# heads and permuted to put the head axis second.
transformer_h_5_attn_c_attn_bias = getattr(self.transformer.h, "5").attn.c_attn.bias size_79 = transformer_h_5_ln_1.size(-1) view_63 = transformer_h_5_ln_1.view(-1, size_79); transformer_h_5_ln_1 = size_79 = None transformer_h_5_attn_c_attn_weight = getattr(self.transformer.h, "5").attn.c_attn.weight addmm_20 = torch.addmm(transformer_h_5_attn_c_attn_bias, view_63, transformer_h_5_attn_c_attn_weight); transformer_h_5_attn_c_attn_bias = view_63 = transformer_h_5_attn_c_attn_weight = None view_64 = addmm_20.view(add_68); addmm_20 = add_68 = None split_5 = view_64.split(768, dim = 2); view_64 = None getitem_66 = split_5[0] getitem_67 = split_5[1] getitem_68 = split_5[2]; split_5 = None size_80 = getitem_66.size() getitem_69 = size_80[slice(None, -1, None)]; size_80 = None add_69 = getitem_69 + (12, 64); getitem_69 = None view_65 = getitem_66.view(add_69); getitem_66 = add_69 = None permute_20 = view_65.permute(0, 2, 1, 3); view_65 = None size_81 = getitem_67.size() getitem_70 = size_81[slice(None, -1, None)]; size_81 = None add_70 = getitem_70 + (12, 64); getitem_70 = None view_66 = getitem_67.view(add_70); getitem_67 = add_70 = None permute_21 = view_66.permute(0, 2, 1, 3); view_66 = None size_82 = getitem_68.size() getitem_71 = size_82[slice(None, -1, None)]; size_82 = None add_71 = getitem_71 + (12, 64); getitem_71 = None view_67 = getitem_68.view(add_71); getitem_68 = add_71 = None permute_22 = view_67.permute(0, 2, 1, 3); view_67 = None transpose_5 = permute_21.transpose(-1, -2) matmul_10 = torch.matmul(permute_20, transpose_5); transpose_5 = None size_83 = permute_22.size(-1) pow_11 = size_83 ** 0.5; size_83 = None getattr_42 = matmul_10.dtype getattr_43 = matmul_10.device full_10 = torch.full([], pow_11, dtype = getattr_42, device = getattr_43); pow_11 = getattr_42 = getattr_43 = None truediv_5 = matmul_10 / full_10; matmul_10 = full_10 = None size_84 = permute_20.size(-2); permute_20 = None size_85 = permute_21.size(-2) transformer_h_5_attn_bias = 
getattr(self.transformer.h, "5").attn.bias sub_6 = size_85 - size_84; size_84 = None getitem_72 = transformer_h_5_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_6, size_85, None), slice(None, size_85, None))]; transformer_h_5_attn_bias = sub_6 = size_85 = None getattr_44 = truediv_5.dtype finfo_5 = torch.finfo(getattr_44); getattr_44 = None getattr_45 = finfo_5.min; finfo_5 = None getattr_46 = truediv_5.dtype full_11 = torch.full([], getattr_45, dtype = getattr_46); getattr_45 = getattr_46 = None getattr_47 = truediv_5.device to_11 = full_11.to(getattr_47); full_11 = getattr_47 = None getattr_48 = truediv_5.dtype to_12 = truediv_5.to(getattr_48); truediv_5 = getattr_48 = None where_5 = torch.where(getitem_72, to_12, to_11); getitem_72 = to_12 = to_11 = None add_72 = where_5 + mul; where_5 = None softmax_5 = torch.nn.functional.softmax(add_72, dim = -1, _stacklevel = 3, dtype = None); add_72 = None getattr_49 = permute_22.dtype type_6 = softmax_5.type(getattr_49); softmax_5 = getattr_49 = None transformer_h_5_attn_attn_dropout = getattr(self.transformer.h, "5").attn.attn_dropout(type_6); type_6 = None matmul_11 = torch.matmul(transformer_h_5_attn_attn_dropout, permute_22); transformer_h_5_attn_attn_dropout = None permute_23 = matmul_11.permute(0, 2, 1, 3); matmul_11 = None contiguous_5 = permute_23.contiguous(); permute_23 = None size_86 = contiguous_5.size() getitem_73 = size_86[slice(None, -2, None)]; size_86 = None add_73 = getitem_73 + (768,); getitem_73 = None view_68 = contiguous_5.view(add_73); contiguous_5 = add_73 = None size_87 = view_68.size() getitem_74 = size_87[slice(None, -1, None)]; size_87 = None add_74 = getitem_74 + (768,); getitem_74 = None transformer_h_5_attn_c_proj_bias = getattr(self.transformer.h, "5").attn.c_proj.bias size_88 = view_68.size(-1) view_69 = view_68.view(-1, size_88); view_68 = size_88 = None transformer_h_5_attn_c_proj_weight = getattr(self.transformer.h, "5").attn.c_proj.weight addmm_21 = 
torch.addmm(transformer_h_5_attn_c_proj_bias, view_69, transformer_h_5_attn_c_proj_weight); transformer_h_5_attn_c_proj_bias = view_69 = transformer_h_5_attn_c_proj_weight = None view_70 = addmm_21.view(add_74); addmm_21 = add_74 = None transformer_h_5_attn_resid_dropout = getattr(self.transformer.h, "5").attn.resid_dropout(view_70); view_70 = None add_75 = transformer_h_5_attn_resid_dropout + add_67; transformer_h_5_attn_resid_dropout = add_67 = None transformer_h_5_ln_2 = getattr(self.transformer.h, "5").ln_2(add_75) size_89 = transformer_h_5_ln_2.size() getitem_75 = size_89[slice(None, -1, None)]; size_89 = None add_76 = getitem_75 + (3072,); getitem_75 = None transformer_h_5_mlp_c_fc_bias = getattr(self.transformer.h, "5").mlp.c_fc.bias size_90 = transformer_h_5_ln_2.size(-1) view_71 = transformer_h_5_ln_2.view(-1, size_90); transformer_h_5_ln_2 = size_90 = None transformer_h_5_mlp_c_fc_weight = getattr(self.transformer.h, "5").mlp.c_fc.weight addmm_22 = torch.addmm(transformer_h_5_mlp_c_fc_bias, view_71, transformer_h_5_mlp_c_fc_weight); transformer_h_5_mlp_c_fc_bias = view_71 = transformer_h_5_mlp_c_fc_weight = None view_72 = addmm_22.view(add_76); addmm_22 = add_76 = None mul_21 = 0.5 * view_72 pow_12 = torch.pow(view_72, 3.0) mul_22 = 0.044715 * pow_12; pow_12 = None add_77 = view_72 + mul_22; view_72 = mul_22 = None mul_23 = 0.7978845608028654 * add_77; add_77 = None tanh_5 = torch.tanh(mul_23); mul_23 = None add_78 = 1.0 + tanh_5; tanh_5 = None mul_24 = mul_21 * add_78; mul_21 = add_78 = None size_91 = mul_24.size() getitem_76 = size_91[slice(None, -1, None)]; size_91 = None add_79 = getitem_76 + (768,); getitem_76 = None transformer_h_5_mlp_c_proj_bias = getattr(self.transformer.h, "5").mlp.c_proj.bias size_92 = mul_24.size(-1) view_73 = mul_24.view(-1, size_92); mul_24 = size_92 = None transformer_h_5_mlp_c_proj_weight = getattr(self.transformer.h, "5").mlp.c_proj.weight addmm_23 = torch.addmm(transformer_h_5_mlp_c_proj_bias, view_73, 
transformer_h_5_mlp_c_proj_weight); transformer_h_5_mlp_c_proj_bias = view_73 = transformer_h_5_mlp_c_proj_weight = None view_74 = addmm_23.view(add_79); addmm_23 = add_79 = None transformer_h_5_mlp_dropout = getattr(self.transformer.h, "5").mlp.dropout(view_74); view_74 = None add_80 = add_75 + transformer_h_5_mlp_dropout; add_75 = transformer_h_5_mlp_dropout = None transformer_h_6_ln_1 = getattr(self.transformer.h, "6").ln_1(add_80) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, add_80, transformer_h_6_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, add_80, transformer_h_6_ln_1): # No stacktrace found for following nodes size_93 = transformer_h_6_ln_1.size() getitem_77 = size_93[slice(None, -1, None)]; size_93 = None add_81 = getitem_77 + (2304,); getitem_77 = None transformer_h_6_attn_c_attn_bias = getattr(self.transformer.h, "6").attn.c_attn.bias size_94 = transformer_h_6_ln_1.size(-1) view_75 = transformer_h_6_ln_1.view(-1, size_94); transformer_h_6_ln_1 = size_94 = None transformer_h_6_attn_c_attn_weight = getattr(self.transformer.h, "6").attn.c_attn.weight addmm_24 = torch.addmm(transformer_h_6_attn_c_attn_bias, view_75, transformer_h_6_attn_c_attn_weight); transformer_h_6_attn_c_attn_bias = view_75 = transformer_h_6_attn_c_attn_weight = None view_76 = addmm_24.view(add_81); addmm_24 = add_81 = None split_6 = view_76.split(768, dim = 2); view_76 = None getitem_78 = split_6[0] getitem_79 = split_6[1] getitem_80 = split_6[2]; split_6 = None size_95 = getitem_78.size() getitem_81 = size_95[slice(None, -1, None)]; size_95 = None add_82 = getitem_81 + (12, 64); getitem_81 = None view_77 = getitem_78.view(add_82); getitem_78 = add_82 = None permute_24 = view_77.permute(0, 2, 1, 
3); view_77 = None size_96 = getitem_79.size() getitem_82 = size_96[slice(None, -1, None)]; size_96 = None add_83 = getitem_82 + (12, 64); getitem_82 = None view_78 = getitem_79.view(add_83); getitem_79 = add_83 = None permute_25 = view_78.permute(0, 2, 1, 3); view_78 = None size_97 = getitem_80.size() getitem_83 = size_97[slice(None, -1, None)]; size_97 = None add_84 = getitem_83 + (12, 64); getitem_83 = None view_79 = getitem_80.view(add_84); getitem_80 = add_84 = None permute_26 = view_79.permute(0, 2, 1, 3); view_79 = None transpose_6 = permute_25.transpose(-1, -2) matmul_12 = torch.matmul(permute_24, transpose_6); transpose_6 = None size_98 = permute_26.size(-1) pow_13 = size_98 ** 0.5; size_98 = None getattr_50 = matmul_12.dtype getattr_51 = matmul_12.device full_12 = torch.full([], pow_13, dtype = getattr_50, device = getattr_51); pow_13 = getattr_50 = getattr_51 = None truediv_6 = matmul_12 / full_12; matmul_12 = full_12 = None size_99 = permute_24.size(-2); permute_24 = None size_100 = permute_25.size(-2) transformer_h_6_attn_bias = getattr(self.transformer.h, "6").attn.bias sub_7 = size_100 - size_99; size_99 = None getitem_84 = transformer_h_6_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_7, size_100, None), slice(None, size_100, None))]; transformer_h_6_attn_bias = sub_7 = size_100 = None getattr_52 = truediv_6.dtype finfo_6 = torch.finfo(getattr_52); getattr_52 = None getattr_53 = finfo_6.min; finfo_6 = None getattr_54 = truediv_6.dtype full_13 = torch.full([], getattr_53, dtype = getattr_54); getattr_53 = getattr_54 = None getattr_55 = truediv_6.device to_13 = full_13.to(getattr_55); full_13 = getattr_55 = None getattr_56 = truediv_6.dtype to_14 = truediv_6.to(getattr_56); truediv_6 = getattr_56 = None where_6 = torch.where(getitem_84, to_14, to_13); getitem_84 = to_14 = to_13 = None add_85 = where_6 + mul; where_6 = None softmax_6 = torch.nn.functional.softmax(add_85, dim = -1, _stacklevel = 3, dtype = None); add_85 = None 
# h.6 attention epilogue: the attn.bias buffer sliced above looks like the
# GPT-2 causal mask applied via torch.where with the dtype's finfo.min —
# NOTE(review): confirm against transformers' modeling_gpt2 GPT2Attention.
getattr_57 = permute_26.dtype type_7 = softmax_6.type(getattr_57); softmax_6 = getattr_57 = None transformer_h_6_attn_attn_dropout = getattr(self.transformer.h, "6").attn.attn_dropout(type_7); type_7 = None matmul_13 = torch.matmul(transformer_h_6_attn_attn_dropout, permute_26); transformer_h_6_attn_attn_dropout = None permute_27 = matmul_13.permute(0, 2, 1, 3); matmul_13 = None contiguous_6 = permute_27.contiguous(); permute_27 = None size_101 = contiguous_6.size() getitem_85 = size_101[slice(None, -2, None)]; size_101 = None add_86 = getitem_85 + (768,); getitem_85 = None view_80 = contiguous_6.view(add_86); contiguous_6 = add_86 = None size_102 = view_80.size() getitem_86 = size_102[slice(None, -1, None)]; size_102 = None add_87 = getitem_86 + (768,); getitem_86 = None transformer_h_6_attn_c_proj_bias = getattr(self.transformer.h, "6").attn.c_proj.bias size_103 = view_80.size(-1) view_81 = view_80.view(-1, size_103); view_80 = size_103 = None transformer_h_6_attn_c_proj_weight = getattr(self.transformer.h, "6").attn.c_proj.weight addmm_25 = torch.addmm(transformer_h_6_attn_c_proj_bias, view_81, transformer_h_6_attn_c_proj_weight); transformer_h_6_attn_c_proj_bias = view_81 = transformer_h_6_attn_c_proj_weight = None view_82 = addmm_25.view(add_87); addmm_25 = add_87 = None transformer_h_6_attn_resid_dropout = getattr(self.transformer.h, "6").attn.resid_dropout(view_82); view_82 = None add_88 = transformer_h_6_attn_resid_dropout + add_80; transformer_h_6_attn_resid_dropout = add_80 = None transformer_h_6_ln_2 = getattr(self.transformer.h, "6").ln_2(add_88) size_104 = transformer_h_6_ln_2.size() getitem_87 = size_104[slice(None, -1, None)]; size_104 = None add_89 = getitem_87 + (3072,); getitem_87 = None transformer_h_6_mlp_c_fc_bias = getattr(self.transformer.h, "6").mlp.c_fc.bias size_105 = transformer_h_6_ln_2.size(-1) view_83 = transformer_h_6_ln_2.view(-1, size_105); transformer_h_6_ln_2 = size_105 = None transformer_h_6_mlp_c_fc_weight = 
getattr(self.transformer.h, "6").mlp.c_fc.weight addmm_26 = torch.addmm(transformer_h_6_mlp_c_fc_bias, view_83, transformer_h_6_mlp_c_fc_weight); transformer_h_6_mlp_c_fc_bias = view_83 = transformer_h_6_mlp_c_fc_weight = None view_84 = addmm_26.view(add_89); addmm_26 = add_89 = None mul_25 = 0.5 * view_84 pow_14 = torch.pow(view_84, 3.0) mul_26 = 0.044715 * pow_14; pow_14 = None add_90 = view_84 + mul_26; view_84 = mul_26 = None mul_27 = 0.7978845608028654 * add_90; add_90 = None tanh_6 = torch.tanh(mul_27); mul_27 = None add_91 = 1.0 + tanh_6; tanh_6 = None mul_28 = mul_25 * add_91; mul_25 = add_91 = None size_106 = mul_28.size() getitem_88 = size_106[slice(None, -1, None)]; size_106 = None add_92 = getitem_88 + (768,); getitem_88 = None transformer_h_6_mlp_c_proj_bias = getattr(self.transformer.h, "6").mlp.c_proj.bias size_107 = mul_28.size(-1) view_85 = mul_28.view(-1, size_107); mul_28 = size_107 = None transformer_h_6_mlp_c_proj_weight = getattr(self.transformer.h, "6").mlp.c_proj.weight addmm_27 = torch.addmm(transformer_h_6_mlp_c_proj_bias, view_85, transformer_h_6_mlp_c_proj_weight); transformer_h_6_mlp_c_proj_bias = view_85 = transformer_h_6_mlp_c_proj_weight = None view_86 = addmm_27.view(add_92); addmm_27 = add_92 = None transformer_h_6_mlp_dropout = getattr(self.transformer.h, "6").mlp.dropout(view_86); view_86 = None add_93 = add_88 + transformer_h_6_mlp_dropout; add_88 = transformer_h_6_mlp_dropout = None transformer_h_7_ln_1 = getattr(self.transformer.h, "7").ln_1(add_93) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, add_93, transformer_h_7_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, add_93, 
transformer_h_7_ln_1): # No stacktrace found for following nodes size_108 = transformer_h_7_ln_1.size() getitem_89 = size_108[slice(None, -1, None)]; size_108 = None add_94 = getitem_89 + (2304,); getitem_89 = None transformer_h_7_attn_c_attn_bias = getattr(self.transformer.h, "7").attn.c_attn.bias size_109 = transformer_h_7_ln_1.size(-1) view_87 = transformer_h_7_ln_1.view(-1, size_109); transformer_h_7_ln_1 = size_109 = None transformer_h_7_attn_c_attn_weight = getattr(self.transformer.h, "7").attn.c_attn.weight addmm_28 = torch.addmm(transformer_h_7_attn_c_attn_bias, view_87, transformer_h_7_attn_c_attn_weight); transformer_h_7_attn_c_attn_bias = view_87 = transformer_h_7_attn_c_attn_weight = None view_88 = addmm_28.view(add_94); addmm_28 = add_94 = None split_7 = view_88.split(768, dim = 2); view_88 = None getitem_90 = split_7[0] getitem_91 = split_7[1] getitem_92 = split_7[2]; split_7 = None size_110 = getitem_90.size() getitem_93 = size_110[slice(None, -1, None)]; size_110 = None add_95 = getitem_93 + (12, 64); getitem_93 = None view_89 = getitem_90.view(add_95); getitem_90 = add_95 = None permute_28 = view_89.permute(0, 2, 1, 3); view_89 = None size_111 = getitem_91.size() getitem_94 = size_111[slice(None, -1, None)]; size_111 = None add_96 = getitem_94 + (12, 64); getitem_94 = None view_90 = getitem_91.view(add_96); getitem_91 = add_96 = None permute_29 = view_90.permute(0, 2, 1, 3); view_90 = None size_112 = getitem_92.size() getitem_95 = size_112[slice(None, -1, None)]; size_112 = None add_97 = getitem_95 + (12, 64); getitem_95 = None view_91 = getitem_92.view(add_97); getitem_92 = add_97 = None permute_30 = view_91.permute(0, 2, 1, 3); view_91 = None transpose_7 = permute_29.transpose(-1, -2) matmul_14 = torch.matmul(permute_28, transpose_7); transpose_7 = None size_113 = permute_30.size(-1) pow_15 = size_113 ** 0.5; size_113 = None getattr_58 = matmul_14.dtype getattr_59 = matmul_14.device full_14 = torch.full([], pow_15, dtype = getattr_58, device = 
getattr_59); pow_15 = getattr_58 = getattr_59 = None truediv_7 = matmul_14 / full_14; matmul_14 = full_14 = None size_114 = permute_28.size(-2); permute_28 = None size_115 = permute_29.size(-2) transformer_h_7_attn_bias = getattr(self.transformer.h, "7").attn.bias sub_8 = size_115 - size_114; size_114 = None getitem_96 = transformer_h_7_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_8, size_115, None), slice(None, size_115, None))]; transformer_h_7_attn_bias = sub_8 = size_115 = None getattr_60 = truediv_7.dtype finfo_7 = torch.finfo(getattr_60); getattr_60 = None getattr_61 = finfo_7.min; finfo_7 = None getattr_62 = truediv_7.dtype full_15 = torch.full([], getattr_61, dtype = getattr_62); getattr_61 = getattr_62 = None getattr_63 = truediv_7.device to_15 = full_15.to(getattr_63); full_15 = getattr_63 = None getattr_64 = truediv_7.dtype to_16 = truediv_7.to(getattr_64); truediv_7 = getattr_64 = None where_7 = torch.where(getitem_96, to_16, to_15); getitem_96 = to_16 = to_15 = None add_98 = where_7 + mul; where_7 = None softmax_7 = torch.nn.functional.softmax(add_98, dim = -1, _stacklevel = 3, dtype = None); add_98 = None getattr_65 = permute_30.dtype type_8 = softmax_7.type(getattr_65); softmax_7 = getattr_65 = None transformer_h_7_attn_attn_dropout = getattr(self.transformer.h, "7").attn.attn_dropout(type_8); type_8 = None matmul_15 = torch.matmul(transformer_h_7_attn_attn_dropout, permute_30); transformer_h_7_attn_attn_dropout = None permute_31 = matmul_15.permute(0, 2, 1, 3); matmul_15 = None contiguous_7 = permute_31.contiguous(); permute_31 = None size_116 = contiguous_7.size() getitem_97 = size_116[slice(None, -2, None)]; size_116 = None add_99 = getitem_97 + (768,); getitem_97 = None view_92 = contiguous_7.view(add_99); contiguous_7 = add_99 = None size_117 = view_92.size() getitem_98 = size_117[slice(None, -1, None)]; size_117 = None add_100 = getitem_98 + (768,); getitem_98 = None transformer_h_7_attn_c_proj_bias = 
getattr(self.transformer.h, "7").attn.c_proj.bias size_118 = view_92.size(-1) view_93 = view_92.view(-1, size_118); view_92 = size_118 = None transformer_h_7_attn_c_proj_weight = getattr(self.transformer.h, "7").attn.c_proj.weight addmm_29 = torch.addmm(transformer_h_7_attn_c_proj_bias, view_93, transformer_h_7_attn_c_proj_weight); transformer_h_7_attn_c_proj_bias = view_93 = transformer_h_7_attn_c_proj_weight = None view_94 = addmm_29.view(add_100); addmm_29 = add_100 = None transformer_h_7_attn_resid_dropout = getattr(self.transformer.h, "7").attn.resid_dropout(view_94); view_94 = None add_101 = transformer_h_7_attn_resid_dropout + add_93; transformer_h_7_attn_resid_dropout = add_93 = None transformer_h_7_ln_2 = getattr(self.transformer.h, "7").ln_2(add_101) size_119 = transformer_h_7_ln_2.size() getitem_99 = size_119[slice(None, -1, None)]; size_119 = None add_102 = getitem_99 + (3072,); getitem_99 = None transformer_h_7_mlp_c_fc_bias = getattr(self.transformer.h, "7").mlp.c_fc.bias size_120 = transformer_h_7_ln_2.size(-1) view_95 = transformer_h_7_ln_2.view(-1, size_120); transformer_h_7_ln_2 = size_120 = None transformer_h_7_mlp_c_fc_weight = getattr(self.transformer.h, "7").mlp.c_fc.weight addmm_30 = torch.addmm(transformer_h_7_mlp_c_fc_bias, view_95, transformer_h_7_mlp_c_fc_weight); transformer_h_7_mlp_c_fc_bias = view_95 = transformer_h_7_mlp_c_fc_weight = None view_96 = addmm_30.view(add_102); addmm_30 = add_102 = None mul_29 = 0.5 * view_96 pow_16 = torch.pow(view_96, 3.0) mul_30 = 0.044715 * pow_16; pow_16 = None add_103 = view_96 + mul_30; view_96 = mul_30 = None mul_31 = 0.7978845608028654 * add_103; add_103 = None tanh_7 = torch.tanh(mul_31); mul_31 = None add_104 = 1.0 + tanh_7; tanh_7 = None mul_32 = mul_29 * add_104; mul_29 = add_104 = None size_121 = mul_32.size() getitem_100 = size_121[slice(None, -1, None)]; size_121 = None add_105 = getitem_100 + (768,); getitem_100 = None transformer_h_7_mlp_c_proj_bias = getattr(self.transformer.h, 
"7").mlp.c_proj.bias size_122 = mul_32.size(-1) view_97 = mul_32.view(-1, size_122); mul_32 = size_122 = None transformer_h_7_mlp_c_proj_weight = getattr(self.transformer.h, "7").mlp.c_proj.weight addmm_31 = torch.addmm(transformer_h_7_mlp_c_proj_bias, view_97, transformer_h_7_mlp_c_proj_weight); transformer_h_7_mlp_c_proj_bias = view_97 = transformer_h_7_mlp_c_proj_weight = None view_98 = addmm_31.view(add_105); addmm_31 = add_105 = None transformer_h_7_mlp_dropout = getattr(self.transformer.h, "7").mlp.dropout(view_98); view_98 = None add_106 = add_101 + transformer_h_7_mlp_dropout; add_101 = transformer_h_7_mlp_dropout = None transformer_h_8_ln_1 = getattr(self.transformer.h, "8").ln_1(add_106) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, add_106, transformer_h_8_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, add_106, transformer_h_8_ln_1): # No stacktrace found for following nodes size_123 = transformer_h_8_ln_1.size() getitem_101 = size_123[slice(None, -1, None)]; size_123 = None add_107 = getitem_101 + (2304,); getitem_101 = None transformer_h_8_attn_c_attn_bias = getattr(self.transformer.h, "8").attn.c_attn.bias size_124 = transformer_h_8_ln_1.size(-1) view_99 = transformer_h_8_ln_1.view(-1, size_124); transformer_h_8_ln_1 = size_124 = None transformer_h_8_attn_c_attn_weight = getattr(self.transformer.h, "8").attn.c_attn.weight addmm_32 = torch.addmm(transformer_h_8_attn_c_attn_bias, view_99, transformer_h_8_attn_c_attn_weight); transformer_h_8_attn_c_attn_bias = view_99 = transformer_h_8_attn_c_attn_weight = None view_100 = addmm_32.view(add_107); addmm_32 = add_107 = None 
# h.8 block: same generated pattern as every layer above — qkv split into
# 12 heads of 64, scores scaled by sqrt(head_dim) via the size(-1) ** 0.5
# scalar, bias-buffer slice + torch.where masking, softmax, dropout, c_proj.
split_8 = view_100.split(768, dim = 2); view_100 = None getitem_102 = split_8[0] getitem_103 = split_8[1] getitem_104 = split_8[2]; split_8 = None size_125 = getitem_102.size() getitem_105 = size_125[slice(None, -1, None)]; size_125 = None add_108 = getitem_105 + (12, 64); getitem_105 = None view_101 = getitem_102.view(add_108); getitem_102 = add_108 = None permute_32 = view_101.permute(0, 2, 1, 3); view_101 = None size_126 = getitem_103.size() getitem_106 = size_126[slice(None, -1, None)]; size_126 = None add_109 = getitem_106 + (12, 64); getitem_106 = None view_102 = getitem_103.view(add_109); getitem_103 = add_109 = None permute_33 = view_102.permute(0, 2, 1, 3); view_102 = None size_127 = getitem_104.size() getitem_107 = size_127[slice(None, -1, None)]; size_127 = None add_110 = getitem_107 + (12, 64); getitem_107 = None view_103 = getitem_104.view(add_110); getitem_104 = add_110 = None permute_34 = view_103.permute(0, 2, 1, 3); view_103 = None transpose_8 = permute_33.transpose(-1, -2) matmul_16 = torch.matmul(permute_32, transpose_8); transpose_8 = None size_128 = permute_34.size(-1) pow_17 = size_128 ** 0.5; size_128 = None getattr_66 = matmul_16.dtype getattr_67 = matmul_16.device full_16 = torch.full([], pow_17, dtype = getattr_66, device = getattr_67); pow_17 = getattr_66 = getattr_67 = None truediv_8 = matmul_16 / full_16; matmul_16 = full_16 = None size_129 = permute_32.size(-2); permute_32 = None size_130 = permute_33.size(-2) transformer_h_8_attn_bias = getattr(self.transformer.h, "8").attn.bias sub_9 = size_130 - size_129; size_129 = None getitem_108 = transformer_h_8_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_9, size_130, None), slice(None, size_130, None))]; transformer_h_8_attn_bias = sub_9 = size_130 = None getattr_68 = truediv_8.dtype finfo_8 = torch.finfo(getattr_68); getattr_68 = None getattr_69 = finfo_8.min; finfo_8 = None getattr_70 = truediv_8.dtype full_17 = torch.full([], getattr_69, dtype = getattr_70); 
getattr_69 = getattr_70 = None getattr_71 = truediv_8.device to_17 = full_17.to(getattr_71); full_17 = getattr_71 = None getattr_72 = truediv_8.dtype to_18 = truediv_8.to(getattr_72); truediv_8 = getattr_72 = None where_8 = torch.where(getitem_108, to_18, to_17); getitem_108 = to_18 = to_17 = None add_111 = where_8 + mul; where_8 = None softmax_8 = torch.nn.functional.softmax(add_111, dim = -1, _stacklevel = 3, dtype = None); add_111 = None getattr_73 = permute_34.dtype type_9 = softmax_8.type(getattr_73); softmax_8 = getattr_73 = None transformer_h_8_attn_attn_dropout = getattr(self.transformer.h, "8").attn.attn_dropout(type_9); type_9 = None matmul_17 = torch.matmul(transformer_h_8_attn_attn_dropout, permute_34); transformer_h_8_attn_attn_dropout = None permute_35 = matmul_17.permute(0, 2, 1, 3); matmul_17 = None contiguous_8 = permute_35.contiguous(); permute_35 = None size_131 = contiguous_8.size() getitem_109 = size_131[slice(None, -2, None)]; size_131 = None add_112 = getitem_109 + (768,); getitem_109 = None view_104 = contiguous_8.view(add_112); contiguous_8 = add_112 = None size_132 = view_104.size() getitem_110 = size_132[slice(None, -1, None)]; size_132 = None add_113 = getitem_110 + (768,); getitem_110 = None transformer_h_8_attn_c_proj_bias = getattr(self.transformer.h, "8").attn.c_proj.bias size_133 = view_104.size(-1) view_105 = view_104.view(-1, size_133); view_104 = size_133 = None transformer_h_8_attn_c_proj_weight = getattr(self.transformer.h, "8").attn.c_proj.weight addmm_33 = torch.addmm(transformer_h_8_attn_c_proj_bias, view_105, transformer_h_8_attn_c_proj_weight); transformer_h_8_attn_c_proj_bias = view_105 = transformer_h_8_attn_c_proj_weight = None view_106 = addmm_33.view(add_113); addmm_33 = add_113 = None transformer_h_8_attn_resid_dropout = getattr(self.transformer.h, "8").attn.resid_dropout(view_106); view_106 = None add_114 = transformer_h_8_attn_resid_dropout + add_106; transformer_h_8_attn_resid_dropout = add_106 = None 
# h.8 MLP: c_fc expands 768 -> 3072, tanh-approximated GELU
# (0.5 * x * (1 + tanh(0.7978845608028654 * (x + 0.044715 * x**3))),
# where 0.7978845608028654 = sqrt(2/pi)), c_proj contracts back to 768,
# dropout, then the residual add whose result feeds h.9's ln_1.
transformer_h_8_ln_2 = getattr(self.transformer.h, "8").ln_2(add_114) size_134 = transformer_h_8_ln_2.size() getitem_111 = size_134[slice(None, -1, None)]; size_134 = None add_115 = getitem_111 + (3072,); getitem_111 = None transformer_h_8_mlp_c_fc_bias = getattr(self.transformer.h, "8").mlp.c_fc.bias size_135 = transformer_h_8_ln_2.size(-1) view_107 = transformer_h_8_ln_2.view(-1, size_135); transformer_h_8_ln_2 = size_135 = None transformer_h_8_mlp_c_fc_weight = getattr(self.transformer.h, "8").mlp.c_fc.weight addmm_34 = torch.addmm(transformer_h_8_mlp_c_fc_bias, view_107, transformer_h_8_mlp_c_fc_weight); transformer_h_8_mlp_c_fc_bias = view_107 = transformer_h_8_mlp_c_fc_weight = None view_108 = addmm_34.view(add_115); addmm_34 = add_115 = None mul_33 = 0.5 * view_108 pow_18 = torch.pow(view_108, 3.0) mul_34 = 0.044715 * pow_18; pow_18 = None add_116 = view_108 + mul_34; view_108 = mul_34 = None mul_35 = 0.7978845608028654 * add_116; add_116 = None tanh_8 = torch.tanh(mul_35); mul_35 = None add_117 = 1.0 + tanh_8; tanh_8 = None mul_36 = mul_33 * add_117; mul_33 = add_117 = None size_136 = mul_36.size() getitem_112 = size_136[slice(None, -1, None)]; size_136 = None add_118 = getitem_112 + (768,); getitem_112 = None transformer_h_8_mlp_c_proj_bias = getattr(self.transformer.h, "8").mlp.c_proj.bias size_137 = mul_36.size(-1) view_109 = mul_36.view(-1, size_137); mul_36 = size_137 = None transformer_h_8_mlp_c_proj_weight = getattr(self.transformer.h, "8").mlp.c_proj.weight addmm_35 = torch.addmm(transformer_h_8_mlp_c_proj_bias, view_109, transformer_h_8_mlp_c_proj_weight); transformer_h_8_mlp_c_proj_bias = view_109 = transformer_h_8_mlp_c_proj_weight = None view_110 = addmm_35.view(add_118); addmm_35 = add_118 = None transformer_h_8_mlp_dropout = getattr(self.transformer.h, "8").mlp.dropout(view_110); view_110 = None add_119 = add_114 + transformer_h_8_mlp_dropout; add_114 = transformer_h_8_mlp_dropout = None transformer_h_9_ln_1 = getattr(self.transformer.h, 
"9").ln_1(add_119) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, add_119, transformer_h_9_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, add_119, transformer_h_9_ln_1): # No stacktrace found for following nodes size_138 = transformer_h_9_ln_1.size() getitem_113 = size_138[slice(None, -1, None)]; size_138 = None add_120 = getitem_113 + (2304,); getitem_113 = None transformer_h_9_attn_c_attn_bias = getattr(self.transformer.h, "9").attn.c_attn.bias size_139 = transformer_h_9_ln_1.size(-1) view_111 = transformer_h_9_ln_1.view(-1, size_139); transformer_h_9_ln_1 = size_139 = None transformer_h_9_attn_c_attn_weight = getattr(self.transformer.h, "9").attn.c_attn.weight addmm_36 = torch.addmm(transformer_h_9_attn_c_attn_bias, view_111, transformer_h_9_attn_c_attn_weight); transformer_h_9_attn_c_attn_bias = view_111 = transformer_h_9_attn_c_attn_weight = None view_112 = addmm_36.view(add_120); addmm_36 = add_120 = None split_9 = view_112.split(768, dim = 2); view_112 = None getitem_114 = split_9[0] getitem_115 = split_9[1] getitem_116 = split_9[2]; split_9 = None size_140 = getitem_114.size() getitem_117 = size_140[slice(None, -1, None)]; size_140 = None add_121 = getitem_117 + (12, 64); getitem_117 = None view_113 = getitem_114.view(add_121); getitem_114 = add_121 = None permute_36 = view_113.permute(0, 2, 1, 3); view_113 = None size_141 = getitem_115.size() getitem_118 = size_141[slice(None, -1, None)]; size_141 = None add_122 = getitem_118 + (12, 64); getitem_118 = None view_114 = getitem_115.view(add_122); getitem_115 = add_122 = None 
permute_37 = view_114.permute(0, 2, 1, 3); view_114 = None size_142 = getitem_116.size() getitem_119 = size_142[slice(None, -1, None)]; size_142 = None add_123 = getitem_119 + (12, 64); getitem_119 = None view_115 = getitem_116.view(add_123); getitem_116 = add_123 = None permute_38 = view_115.permute(0, 2, 1, 3); view_115 = None transpose_9 = permute_37.transpose(-1, -2) matmul_18 = torch.matmul(permute_36, transpose_9); transpose_9 = None size_143 = permute_38.size(-1) pow_19 = size_143 ** 0.5; size_143 = None getattr_74 = matmul_18.dtype getattr_75 = matmul_18.device full_18 = torch.full([], pow_19, dtype = getattr_74, device = getattr_75); pow_19 = getattr_74 = getattr_75 = None truediv_9 = matmul_18 / full_18; matmul_18 = full_18 = None size_144 = permute_36.size(-2); permute_36 = None size_145 = permute_37.size(-2) transformer_h_9_attn_bias = getattr(self.transformer.h, "9").attn.bias sub_10 = size_145 - size_144; size_144 = None getitem_120 = transformer_h_9_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_10, size_145, None), slice(None, size_145, None))]; transformer_h_9_attn_bias = sub_10 = size_145 = None getattr_76 = truediv_9.dtype finfo_9 = torch.finfo(getattr_76); getattr_76 = None getattr_77 = finfo_9.min; finfo_9 = None getattr_78 = truediv_9.dtype full_19 = torch.full([], getattr_77, dtype = getattr_78); getattr_77 = getattr_78 = None getattr_79 = truediv_9.device to_19 = full_19.to(getattr_79); full_19 = getattr_79 = None getattr_80 = truediv_9.dtype to_20 = truediv_9.to(getattr_80); truediv_9 = getattr_80 = None where_9 = torch.where(getitem_120, to_20, to_19); getitem_120 = to_20 = to_19 = None add_124 = where_9 + mul; where_9 = None softmax_9 = torch.nn.functional.softmax(add_124, dim = -1, _stacklevel = 3, dtype = None); add_124 = None getattr_81 = permute_38.dtype type_10 = softmax_9.type(getattr_81); softmax_9 = getattr_81 = None transformer_h_9_attn_attn_dropout = getattr(self.transformer.h, 
"9").attn.attn_dropout(type_10); type_10 = None matmul_19 = torch.matmul(transformer_h_9_attn_attn_dropout, permute_38); transformer_h_9_attn_attn_dropout = None permute_39 = matmul_19.permute(0, 2, 1, 3); matmul_19 = None contiguous_9 = permute_39.contiguous(); permute_39 = None size_146 = contiguous_9.size() getitem_121 = size_146[slice(None, -2, None)]; size_146 = None add_125 = getitem_121 + (768,); getitem_121 = None view_116 = contiguous_9.view(add_125); contiguous_9 = add_125 = None size_147 = view_116.size() getitem_122 = size_147[slice(None, -1, None)]; size_147 = None add_126 = getitem_122 + (768,); getitem_122 = None transformer_h_9_attn_c_proj_bias = getattr(self.transformer.h, "9").attn.c_proj.bias size_148 = view_116.size(-1) view_117 = view_116.view(-1, size_148); view_116 = size_148 = None transformer_h_9_attn_c_proj_weight = getattr(self.transformer.h, "9").attn.c_proj.weight addmm_37 = torch.addmm(transformer_h_9_attn_c_proj_bias, view_117, transformer_h_9_attn_c_proj_weight); transformer_h_9_attn_c_proj_bias = view_117 = transformer_h_9_attn_c_proj_weight = None view_118 = addmm_37.view(add_126); addmm_37 = add_126 = None transformer_h_9_attn_resid_dropout = getattr(self.transformer.h, "9").attn.resid_dropout(view_118); view_118 = None add_127 = transformer_h_9_attn_resid_dropout + add_119; transformer_h_9_attn_resid_dropout = add_119 = None transformer_h_9_ln_2 = getattr(self.transformer.h, "9").ln_2(add_127) size_149 = transformer_h_9_ln_2.size() getitem_123 = size_149[slice(None, -1, None)]; size_149 = None add_128 = getitem_123 + (3072,); getitem_123 = None transformer_h_9_mlp_c_fc_bias = getattr(self.transformer.h, "9").mlp.c_fc.bias size_150 = transformer_h_9_ln_2.size(-1) view_119 = transformer_h_9_ln_2.view(-1, size_150); transformer_h_9_ln_2 = size_150 = None transformer_h_9_mlp_c_fc_weight = getattr(self.transformer.h, "9").mlp.c_fc.weight addmm_38 = torch.addmm(transformer_h_9_mlp_c_fc_bias, view_119, transformer_h_9_mlp_c_fc_weight); 
transformer_h_9_mlp_c_fc_bias = view_119 = transformer_h_9_mlp_c_fc_weight = None view_120 = addmm_38.view(add_128); addmm_38 = add_128 = None mul_37 = 0.5 * view_120 pow_20 = torch.pow(view_120, 3.0) mul_38 = 0.044715 * pow_20; pow_20 = None add_129 = view_120 + mul_38; view_120 = mul_38 = None mul_39 = 0.7978845608028654 * add_129; add_129 = None tanh_9 = torch.tanh(mul_39); mul_39 = None add_130 = 1.0 + tanh_9; tanh_9 = None mul_40 = mul_37 * add_130; mul_37 = add_130 = None size_151 = mul_40.size() getitem_124 = size_151[slice(None, -1, None)]; size_151 = None add_131 = getitem_124 + (768,); getitem_124 = None transformer_h_9_mlp_c_proj_bias = getattr(self.transformer.h, "9").mlp.c_proj.bias size_152 = mul_40.size(-1) view_121 = mul_40.view(-1, size_152); mul_40 = size_152 = None transformer_h_9_mlp_c_proj_weight = getattr(self.transformer.h, "9").mlp.c_proj.weight addmm_39 = torch.addmm(transformer_h_9_mlp_c_proj_bias, view_121, transformer_h_9_mlp_c_proj_weight); transformer_h_9_mlp_c_proj_bias = view_121 = transformer_h_9_mlp_c_proj_weight = None view_122 = addmm_39.view(add_131); addmm_39 = add_131 = None transformer_h_9_mlp_dropout = getattr(self.transformer.h, "9").mlp.dropout(view_122); view_122 = None add_132 = add_127 + transformer_h_9_mlp_dropout; add_127 = transformer_h_9_mlp_dropout = None transformer_h_10_ln_1 = getattr(self.transformer.h, "10").ln_1(add_132) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, add_132, transformer_h_10_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, 
permute_38, add_132, transformer_h_10_ln_1): # No stacktrace found for following nodes size_153 = transformer_h_10_ln_1.size() getitem_125 = size_153[slice(None, -1, None)]; size_153 = None add_133 = getitem_125 + (2304,); getitem_125 = None transformer_h_10_attn_c_attn_bias = getattr(self.transformer.h, "10").attn.c_attn.bias size_154 = transformer_h_10_ln_1.size(-1) view_123 = transformer_h_10_ln_1.view(-1, size_154); transformer_h_10_ln_1 = size_154 = None transformer_h_10_attn_c_attn_weight = getattr(self.transformer.h, "10").attn.c_attn.weight addmm_40 = torch.addmm(transformer_h_10_attn_c_attn_bias, view_123, transformer_h_10_attn_c_attn_weight); transformer_h_10_attn_c_attn_bias = view_123 = transformer_h_10_attn_c_attn_weight = None view_124 = addmm_40.view(add_133); addmm_40 = add_133 = None split_10 = view_124.split(768, dim = 2); view_124 = None getitem_126 = split_10[0] getitem_127 = split_10[1] getitem_128 = split_10[2]; split_10 = None size_155 = getitem_126.size() getitem_129 = size_155[slice(None, -1, None)]; size_155 = None add_134 = getitem_129 + (12, 64); getitem_129 = None view_125 = getitem_126.view(add_134); getitem_126 = add_134 = None permute_40 = view_125.permute(0, 2, 1, 3); view_125 = None size_156 = getitem_127.size() getitem_130 = size_156[slice(None, -1, None)]; size_156 = None add_135 = getitem_130 + (12, 64); getitem_130 = None view_126 = getitem_127.view(add_135); getitem_127 = add_135 = None permute_41 = view_126.permute(0, 2, 1, 3); view_126 = None size_157 = getitem_128.size() getitem_131 = size_157[slice(None, -1, None)]; size_157 = None add_136 = getitem_131 + (12, 64); getitem_131 = None view_127 = getitem_128.view(add_136); getitem_128 = add_136 = None permute_42 = view_127.permute(0, 2, 1, 3); view_127 = None transpose_10 = permute_41.transpose(-1, -2) matmul_20 = torch.matmul(permute_40, transpose_10); transpose_10 = None size_158 = permute_42.size(-1) pow_21 = size_158 ** 0.5; size_158 = None getattr_82 = matmul_20.dtype 
getattr_83 = matmul_20.device full_20 = torch.full([], pow_21, dtype = getattr_82, device = getattr_83); pow_21 = getattr_82 = getattr_83 = None truediv_10 = matmul_20 / full_20; matmul_20 = full_20 = None size_159 = permute_40.size(-2); permute_40 = None size_160 = permute_41.size(-2) transformer_h_10_attn_bias = getattr(self.transformer.h, "10").attn.bias sub_11 = size_160 - size_159; size_159 = None getitem_132 = transformer_h_10_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_11, size_160, None), slice(None, size_160, None))]; transformer_h_10_attn_bias = sub_11 = size_160 = None getattr_84 = truediv_10.dtype finfo_10 = torch.finfo(getattr_84); getattr_84 = None getattr_85 = finfo_10.min; finfo_10 = None getattr_86 = truediv_10.dtype full_21 = torch.full([], getattr_85, dtype = getattr_86); getattr_85 = getattr_86 = None getattr_87 = truediv_10.device to_21 = full_21.to(getattr_87); full_21 = getattr_87 = None getattr_88 = truediv_10.dtype to_22 = truediv_10.to(getattr_88); truediv_10 = getattr_88 = None where_10 = torch.where(getitem_132, to_22, to_21); getitem_132 = to_22 = to_21 = None add_137 = where_10 + mul; where_10 = None softmax_10 = torch.nn.functional.softmax(add_137, dim = -1, _stacklevel = 3, dtype = None); add_137 = None getattr_89 = permute_42.dtype type_11 = softmax_10.type(getattr_89); softmax_10 = getattr_89 = None transformer_h_10_attn_attn_dropout = getattr(self.transformer.h, "10").attn.attn_dropout(type_11); type_11 = None matmul_21 = torch.matmul(transformer_h_10_attn_attn_dropout, permute_42); transformer_h_10_attn_attn_dropout = None permute_43 = matmul_21.permute(0, 2, 1, 3); matmul_21 = None contiguous_10 = permute_43.contiguous(); permute_43 = None size_161 = contiguous_10.size() getitem_133 = size_161[slice(None, -2, None)]; size_161 = None add_138 = getitem_133 + (768,); getitem_133 = None view_128 = contiguous_10.view(add_138); contiguous_10 = add_138 = None size_162 = view_128.size() getitem_134 = 
size_162[slice(None, -1, None)]; size_162 = None add_139 = getitem_134 + (768,); getitem_134 = None transformer_h_10_attn_c_proj_bias = getattr(self.transformer.h, "10").attn.c_proj.bias size_163 = view_128.size(-1) view_129 = view_128.view(-1, size_163); view_128 = size_163 = None transformer_h_10_attn_c_proj_weight = getattr(self.transformer.h, "10").attn.c_proj.weight addmm_41 = torch.addmm(transformer_h_10_attn_c_proj_bias, view_129, transformer_h_10_attn_c_proj_weight); transformer_h_10_attn_c_proj_bias = view_129 = transformer_h_10_attn_c_proj_weight = None view_130 = addmm_41.view(add_139); addmm_41 = add_139 = None transformer_h_10_attn_resid_dropout = getattr(self.transformer.h, "10").attn.resid_dropout(view_130); view_130 = None add_140 = transformer_h_10_attn_resid_dropout + add_132; transformer_h_10_attn_resid_dropout = add_132 = None transformer_h_10_ln_2 = getattr(self.transformer.h, "10").ln_2(add_140) size_164 = transformer_h_10_ln_2.size() getitem_135 = size_164[slice(None, -1, None)]; size_164 = None add_141 = getitem_135 + (3072,); getitem_135 = None transformer_h_10_mlp_c_fc_bias = getattr(self.transformer.h, "10").mlp.c_fc.bias size_165 = transformer_h_10_ln_2.size(-1) view_131 = transformer_h_10_ln_2.view(-1, size_165); transformer_h_10_ln_2 = size_165 = None transformer_h_10_mlp_c_fc_weight = getattr(self.transformer.h, "10").mlp.c_fc.weight addmm_42 = torch.addmm(transformer_h_10_mlp_c_fc_bias, view_131, transformer_h_10_mlp_c_fc_weight); transformer_h_10_mlp_c_fc_bias = view_131 = transformer_h_10_mlp_c_fc_weight = None view_132 = addmm_42.view(add_141); addmm_42 = add_141 = None mul_41 = 0.5 * view_132 pow_22 = torch.pow(view_132, 3.0) mul_42 = 0.044715 * pow_22; pow_22 = None add_142 = view_132 + mul_42; view_132 = mul_42 = None mul_43 = 0.7978845608028654 * add_142; add_142 = None tanh_10 = torch.tanh(mul_43); mul_43 = None add_143 = 1.0 + tanh_10; tanh_10 = None mul_44 = mul_41 * add_143; mul_41 = add_143 = None size_166 = mul_44.size() 
getitem_136 = size_166[slice(None, -1, None)]; size_166 = None add_144 = getitem_136 + (768,); getitem_136 = None transformer_h_10_mlp_c_proj_bias = getattr(self.transformer.h, "10").mlp.c_proj.bias size_167 = mul_44.size(-1) view_133 = mul_44.view(-1, size_167); mul_44 = size_167 = None transformer_h_10_mlp_c_proj_weight = getattr(self.transformer.h, "10").mlp.c_proj.weight addmm_43 = torch.addmm(transformer_h_10_mlp_c_proj_bias, view_133, transformer_h_10_mlp_c_proj_weight); transformer_h_10_mlp_c_proj_bias = view_133 = transformer_h_10_mlp_c_proj_weight = None view_134 = addmm_43.view(add_144); addmm_43 = add_144 = None transformer_h_10_mlp_dropout = getattr(self.transformer.h, "10").mlp.dropout(view_134); view_134 = None add_145 = add_140 + transformer_h_10_mlp_dropout; add_140 = transformer_h_10_mlp_dropout = None transformer_h_11_ln_1 = getattr(self.transformer.h, "11").ln_1(add_145) return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, permute_41, permute_42, add_145, transformer_h_11_ln_1) class GraphModule(torch.nn.Module): def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, permute_41, permute_42, add_145, transformer_h_11_ln_1): # No stacktrace found for following nodes size_168 = transformer_h_11_ln_1.size() getitem_137 = size_168[slice(None, -1, None)]; size_168 = None add_146 = getitem_137 + (2304,); getitem_137 = None transformer_h_11_attn_c_attn_bias = getattr(self.transformer.h, "11").attn.c_attn.bias size_169 = transformer_h_11_ln_1.size(-1) view_135 = transformer_h_11_ln_1.view(-1, size_169); transformer_h_11_ln_1 = size_169 
= None transformer_h_11_attn_c_attn_weight = getattr(self.transformer.h, "11").attn.c_attn.weight addmm_44 = torch.addmm(transformer_h_11_attn_c_attn_bias, view_135, transformer_h_11_attn_c_attn_weight); transformer_h_11_attn_c_attn_bias = view_135 = transformer_h_11_attn_c_attn_weight = None view_136 = addmm_44.view(add_146); addmm_44 = add_146 = None split_11 = view_136.split(768, dim = 2); view_136 = None getitem_138 = split_11[0] getitem_139 = split_11[1] getitem_140 = split_11[2]; split_11 = None size_170 = getitem_138.size() getitem_141 = size_170[slice(None, -1, None)]; size_170 = None add_147 = getitem_141 + (12, 64); getitem_141 = None view_137 = getitem_138.view(add_147); getitem_138 = add_147 = None permute_44 = view_137.permute(0, 2, 1, 3); view_137 = None size_171 = getitem_139.size() getitem_142 = size_171[slice(None, -1, None)]; size_171 = None add_148 = getitem_142 + (12, 64); getitem_142 = None view_138 = getitem_139.view(add_148); getitem_139 = add_148 = None permute_45 = view_138.permute(0, 2, 1, 3); view_138 = None size_172 = getitem_140.size() getitem_143 = size_172[slice(None, -1, None)]; size_172 = None add_149 = getitem_143 + (12, 64); getitem_143 = None view_139 = getitem_140.view(add_149); getitem_140 = add_149 = None permute_46 = view_139.permute(0, 2, 1, 3); view_139 = None transpose_11 = permute_45.transpose(-1, -2) matmul_22 = torch.matmul(permute_44, transpose_11); transpose_11 = None size_173 = permute_46.size(-1) pow_23 = size_173 ** 0.5; size_173 = None getattr_90 = matmul_22.dtype getattr_91 = matmul_22.device full_22 = torch.full([], pow_23, dtype = getattr_90, device = getattr_91); pow_23 = getattr_90 = getattr_91 = None truediv_11 = matmul_22 / full_22; matmul_22 = full_22 = None size_174 = permute_44.size(-2); permute_44 = None size_175 = permute_45.size(-2) transformer_h_11_attn_bias = getattr(self.transformer.h, "11").attn.bias sub_12 = size_175 - size_174; size_174 = None getitem_144 = 
transformer_h_11_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_12, size_175, None), slice(None, size_175, None))]; transformer_h_11_attn_bias = sub_12 = size_175 = None getattr_92 = truediv_11.dtype finfo_11 = torch.finfo(getattr_92); getattr_92 = None getattr_93 = finfo_11.min; finfo_11 = None getattr_94 = truediv_11.dtype full_23 = torch.full([], getattr_93, dtype = getattr_94); getattr_93 = getattr_94 = None getattr_95 = truediv_11.device to_23 = full_23.to(getattr_95); full_23 = getattr_95 = None getattr_96 = truediv_11.dtype to_24 = truediv_11.to(getattr_96); truediv_11 = getattr_96 = None where_11 = torch.where(getitem_144, to_24, to_23); getitem_144 = to_24 = to_23 = None add_150 = where_11 + mul; where_11 = mul = None softmax_11 = torch.nn.functional.softmax(add_150, dim = -1, _stacklevel = 3, dtype = None); add_150 = None getattr_97 = permute_46.dtype type_12 = softmax_11.type(getattr_97); softmax_11 = getattr_97 = None transformer_h_11_attn_attn_dropout = getattr(self.transformer.h, "11").attn.attn_dropout(type_12); type_12 = None matmul_23 = torch.matmul(transformer_h_11_attn_attn_dropout, permute_46); transformer_h_11_attn_attn_dropout = None permute_47 = matmul_23.permute(0, 2, 1, 3); matmul_23 = None contiguous_11 = permute_47.contiguous(); permute_47 = None size_176 = contiguous_11.size() getitem_145 = size_176[slice(None, -2, None)]; size_176 = None add_151 = getitem_145 + (768,); getitem_145 = None view_140 = contiguous_11.view(add_151); contiguous_11 = add_151 = None size_177 = view_140.size() getitem_146 = size_177[slice(None, -1, None)]; size_177 = None add_152 = getitem_146 + (768,); getitem_146 = None transformer_h_11_attn_c_proj_bias = getattr(self.transformer.h, "11").attn.c_proj.bias size_178 = view_140.size(-1) view_141 = view_140.view(-1, size_178); view_140 = size_178 = None transformer_h_11_attn_c_proj_weight = getattr(self.transformer.h, "11").attn.c_proj.weight addmm_45 = 
torch.addmm(transformer_h_11_attn_c_proj_bias, view_141, transformer_h_11_attn_c_proj_weight); transformer_h_11_attn_c_proj_bias = view_141 = transformer_h_11_attn_c_proj_weight = None view_142 = addmm_45.view(add_152); addmm_45 = add_152 = None transformer_h_11_attn_resid_dropout = getattr(self.transformer.h, "11").attn.resid_dropout(view_142); view_142 = None add_153 = transformer_h_11_attn_resid_dropout + add_145; transformer_h_11_attn_resid_dropout = add_145 = None transformer_h_11_ln_2 = getattr(self.transformer.h, "11").ln_2(add_153) size_179 = transformer_h_11_ln_2.size() getitem_147 = size_179[slice(None, -1, None)]; size_179 = None add_154 = getitem_147 + (3072,); getitem_147 = None transformer_h_11_mlp_c_fc_bias = getattr(self.transformer.h, "11").mlp.c_fc.bias size_180 = transformer_h_11_ln_2.size(-1) view_143 = transformer_h_11_ln_2.view(-1, size_180); transformer_h_11_ln_2 = size_180 = None transformer_h_11_mlp_c_fc_weight = getattr(self.transformer.h, "11").mlp.c_fc.weight addmm_46 = torch.addmm(transformer_h_11_mlp_c_fc_bias, view_143, transformer_h_11_mlp_c_fc_weight); transformer_h_11_mlp_c_fc_bias = view_143 = transformer_h_11_mlp_c_fc_weight = None view_144 = addmm_46.view(add_154); addmm_46 = add_154 = None mul_45 = 0.5 * view_144 pow_24 = torch.pow(view_144, 3.0) mul_46 = 0.044715 * pow_24; pow_24 = None add_155 = view_144 + mul_46; view_144 = mul_46 = None mul_47 = 0.7978845608028654 * add_155; add_155 = None tanh_11 = torch.tanh(mul_47); mul_47 = None add_156 = 1.0 + tanh_11; tanh_11 = None mul_48 = mul_45 * add_156; mul_45 = add_156 = None size_181 = mul_48.size() getitem_148 = size_181[slice(None, -1, None)]; size_181 = None add_157 = getitem_148 + (768,); getitem_148 = None transformer_h_11_mlp_c_proj_bias = getattr(self.transformer.h, "11").mlp.c_proj.bias size_182 = mul_48.size(-1) view_145 = mul_48.view(-1, size_182); mul_48 = size_182 = None transformer_h_11_mlp_c_proj_weight = getattr(self.transformer.h, "11").mlp.c_proj.weight 
addmm_47 = torch.addmm(transformer_h_11_mlp_c_proj_bias, view_145, transformer_h_11_mlp_c_proj_weight); transformer_h_11_mlp_c_proj_bias = view_145 = transformer_h_11_mlp_c_proj_weight = None view_146 = addmm_47.view(add_157); addmm_47 = add_157 = None transformer_h_11_mlp_dropout = getattr(self.transformer.h, "11").mlp.dropout(view_146); view_146 = None add_158 = add_153 + transformer_h_11_mlp_dropout; add_153 = transformer_h_11_mlp_dropout = None transformer_ln_f = self.transformer.ln_f(add_158); add_158 = None return (labels, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, permute_41, permute_42, permute_45, permute_46, transformer_ln_f) class GraphModule(torch.nn.Module): def forward(self, labels, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, permute_41, permute_42, permute_45, permute_46, transformer_ln_f): # No stacktrace found for following nodes view_147 = transformer_ln_f.view(add_2); transformer_ln_f = add_2 = None lm_head = self.lm_head(view_147); view_147 = None getitem_149 = lm_head[(Ellipsis, slice(None, -1, None), slice(None, None, None))] contiguous_12 = getitem_149.contiguous(); getitem_149 = None getitem_150 = labels[(Ellipsis, slice(1, None, None))]; labels = None contiguous_13 = getitem_150.contiguous(); getitem_150 = None size_183 = contiguous_12.size(-1) view_148 = contiguous_12.view(-1, size_183); contiguous_12 = size_183 = None view_149 = contiguous_13.view(-1); contiguous_13 = None crossentropyloss_0 = self.crossentropyloss_0(view_148, view_149); view_148 = view_149 = None return {'loss': crossentropyloss_0, 'logits': lm_head, 'past_key_values': ((permute_1, 
permute_2), (permute_5, permute_6), (permute_9, permute_10), (permute_13, permute_14), (permute_17, permute_18), (permute_21, permute_22), (permute_25, permute_26), (permute_29, permute_30), (permute_33, permute_34), (permute_37, permute_38), (permute_41, permute_42), (permute_45, permute_46))}
['class GraphModule(torch.nn.Module):\n def forward(self, input_ids : torch.Tensor, attention_mask : torch.Tensor, labels : torch.Tensor):\n # No stacktrace found for following nodes\n size = input_ids.size()\n getitem = size[-1]\n view = input_ids.view(-1, getitem); input_ids = getitem = None\n size_1 = view.size()\n getitem_1 = size_1[0]; size_1 = None\n getitem_2 = size[-1]\n add = getitem_2 + 0; getitem_2 = None\n getattr_1 = view.device\n arange = torch.arange(0, add, dtype = torch.int64, device = getattr_1); add = getattr_1 = None\n unsqueeze = arange.unsqueeze(0); arange = None\n getitem_3 = size[-1]\n view_1 = unsqueeze.view(-1, getitem_3); unsqueeze = getitem_3 = None\n le = getitem_1 <= 0\n view_2 = attention_mask.view(getitem_1, -1); attention_mask = getitem_1 = None\n getitem_4 = view_2[(slice(None, None, None), None, None, slice(None, None, None))]; view_2 = None\n to = getitem_4.to(dtype = torch.float32); getitem_4 = None\n sub = 1.0 - to; to = None\n mul = sub * -3.4028234663852886e+38; sub = None\n transformer_wte = self.transformer.wte(view); view = None\n transformer_wpe = self.transformer.wpe(view_1); view_1 = None\n add_1 = transformer_wte + transformer_wpe; transformer_wte = transformer_wpe = None\n transformer_drop = self.transformer.drop(add_1); add_1 = None\n size_2 = transformer_drop.size(-1)\n add_2 = size + (size_2,); size = size_2 = None\n transformer_h_0_ln_1 = getattr(self.transformer.h, "0").ln_1(transformer_drop)\n return (labels, mul, transformer_drop, add_2, transformer_h_0_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, transformer_drop, add_2, transformer_h_0_ln_1):\n # No stacktrace found for following nodes\n size_3 = transformer_h_0_ln_1.size()\n getitem_5 = size_3[slice(None, -1, None)]; size_3 = None\n add_3 = getitem_5 + (2304,); getitem_5 = None\n transformer_h_0_attn_c_attn_bias = getattr(self.transformer.h, "0").attn.c_attn.bias\n size_4 = transformer_h_0_ln_1.size(-1)\n view_3 = 
transformer_h_0_ln_1.view(-1, size_4); transformer_h_0_ln_1 = size_4 = None\n transformer_h_0_attn_c_attn_weight = getattr(self.transformer.h, "0").attn.c_attn.weight\n addmm = torch.addmm(transformer_h_0_attn_c_attn_bias, view_3, transformer_h_0_attn_c_attn_weight); transformer_h_0_attn_c_attn_bias = view_3 = transformer_h_0_attn_c_attn_weight = None\n view_4 = addmm.view(add_3); addmm = add_3 = None\n split = view_4.split(768, dim = 2); view_4 = None\n getitem_6 = split[0]\n getitem_7 = split[1]\n getitem_8 = split[2]; split = None\n size_5 = getitem_6.size()\n getitem_9 = size_5[slice(None, -1, None)]; size_5 = None\n add_4 = getitem_9 + (12, 64); getitem_9 = None\n view_5 = getitem_6.view(add_4); getitem_6 = add_4 = None\n permute = view_5.permute(0, 2, 1, 3); view_5 = None\n size_6 = getitem_7.size()\n getitem_10 = size_6[slice(None, -1, None)]; size_6 = None\n add_5 = getitem_10 + (12, 64); getitem_10 = None\n view_6 = getitem_7.view(add_5); getitem_7 = add_5 = None\n permute_1 = view_6.permute(0, 2, 1, 3); view_6 = None\n size_7 = getitem_8.size()\n getitem_11 = size_7[slice(None, -1, None)]; size_7 = None\n add_6 = getitem_11 + (12, 64); getitem_11 = None\n view_7 = getitem_8.view(add_6); getitem_8 = add_6 = None\n permute_2 = view_7.permute(0, 2, 1, 3); view_7 = None\n transpose = permute_1.transpose(-1, -2)\n matmul = torch.matmul(permute, transpose); transpose = None\n size_8 = permute_2.size(-1)\n pow_1 = size_8 ** 0.5; size_8 = None\n getattr_2 = matmul.dtype\n getattr_3 = matmul.device\n full = torch.full([], pow_1, dtype = getattr_2, device = getattr_3); pow_1 = getattr_2 = getattr_3 = None\n truediv = matmul / full; matmul = full = None\n size_9 = permute.size(-2); permute = None\n size_10 = permute_1.size(-2)\n transformer_h_0_attn_bias = getattr(self.transformer.h, "0").attn.bias\n sub_1 = size_10 - size_9; size_9 = None\n getitem_12 = transformer_h_0_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_1, size_10, None), 
slice(None, size_10, None))]; transformer_h_0_attn_bias = sub_1 = size_10 = None\n getattr_4 = truediv.dtype\n finfo = torch.finfo(getattr_4); getattr_4 = None\n getattr_5 = finfo.min; finfo = None\n getattr_6 = truediv.dtype\n full_1 = torch.full([], getattr_5, dtype = getattr_6); getattr_5 = getattr_6 = None\n getattr_7 = truediv.device\n to_1 = full_1.to(getattr_7); full_1 = getattr_7 = None\n getattr_8 = truediv.dtype\n to_2 = truediv.to(getattr_8); truediv = getattr_8 = None\n where = torch.where(getitem_12, to_2, to_1); getitem_12 = to_2 = to_1 = None\n add_7 = where + mul; where = None\n softmax = torch.nn.functional.softmax(add_7, dim = -1, _stacklevel = 3, dtype = None); add_7 = None\n getattr_9 = permute_2.dtype\n type_1 = softmax.type(getattr_9); softmax = getattr_9 = None\n transformer_h_0_attn_attn_dropout = getattr(self.transformer.h, "0").attn.attn_dropout(type_1); type_1 = None\n matmul_1 = torch.matmul(transformer_h_0_attn_attn_dropout, permute_2); transformer_h_0_attn_attn_dropout = None\n permute_3 = matmul_1.permute(0, 2, 1, 3); matmul_1 = None\n contiguous = permute_3.contiguous(); permute_3 = None\n size_11 = contiguous.size()\n getitem_13 = size_11[slice(None, -2, None)]; size_11 = None\n add_8 = getitem_13 + (768,); getitem_13 = None\n view_8 = contiguous.view(add_8); contiguous = add_8 = None\n size_12 = view_8.size()\n getitem_14 = size_12[slice(None, -1, None)]; size_12 = None\n add_9 = getitem_14 + (768,); getitem_14 = None\n transformer_h_0_attn_c_proj_bias = getattr(self.transformer.h, "0").attn.c_proj.bias\n size_13 = view_8.size(-1)\n view_9 = view_8.view(-1, size_13); view_8 = size_13 = None\n transformer_h_0_attn_c_proj_weight = getattr(self.transformer.h, "0").attn.c_proj.weight\n addmm_1 = torch.addmm(transformer_h_0_attn_c_proj_bias, view_9, transformer_h_0_attn_c_proj_weight); transformer_h_0_attn_c_proj_bias = view_9 = transformer_h_0_attn_c_proj_weight = None\n view_10 = addmm_1.view(add_9); addmm_1 = add_9 = None\n 
transformer_h_0_attn_resid_dropout = getattr(self.transformer.h, "0").attn.resid_dropout(view_10); view_10 = None\n add_10 = transformer_h_0_attn_resid_dropout + transformer_drop; transformer_h_0_attn_resid_dropout = transformer_drop = None\n transformer_h_0_ln_2 = getattr(self.transformer.h, "0").ln_2(add_10)\n size_14 = transformer_h_0_ln_2.size()\n getitem_15 = size_14[slice(None, -1, None)]; size_14 = None\n add_11 = getitem_15 + (3072,); getitem_15 = None\n transformer_h_0_mlp_c_fc_bias = getattr(self.transformer.h, "0").mlp.c_fc.bias\n size_15 = transformer_h_0_ln_2.size(-1)\n view_11 = transformer_h_0_ln_2.view(-1, size_15); transformer_h_0_ln_2 = size_15 = None\n transformer_h_0_mlp_c_fc_weight = getattr(self.transformer.h, "0").mlp.c_fc.weight\n addmm_2 = torch.addmm(transformer_h_0_mlp_c_fc_bias, view_11, transformer_h_0_mlp_c_fc_weight); transformer_h_0_mlp_c_fc_bias = view_11 = transformer_h_0_mlp_c_fc_weight = None\n view_12 = addmm_2.view(add_11); addmm_2 = add_11 = None\n mul_1 = 0.5 * view_12\n pow_2 = torch.pow(view_12, 3.0)\n mul_2 = 0.044715 * pow_2; pow_2 = None\n add_12 = view_12 + mul_2; view_12 = mul_2 = None\n mul_3 = 0.7978845608028654 * add_12; add_12 = None\n tanh = torch.tanh(mul_3); mul_3 = None\n add_13 = 1.0 + tanh; tanh = None\n mul_4 = mul_1 * add_13; mul_1 = add_13 = None\n size_16 = mul_4.size()\n getitem_16 = size_16[slice(None, -1, None)]; size_16 = None\n add_14 = getitem_16 + (768,); getitem_16 = None\n transformer_h_0_mlp_c_proj_bias = getattr(self.transformer.h, "0").mlp.c_proj.bias\n size_17 = mul_4.size(-1)\n view_13 = mul_4.view(-1, size_17); mul_4 = size_17 = None\n transformer_h_0_mlp_c_proj_weight = getattr(self.transformer.h, "0").mlp.c_proj.weight\n addmm_3 = torch.addmm(transformer_h_0_mlp_c_proj_bias, view_13, transformer_h_0_mlp_c_proj_weight); transformer_h_0_mlp_c_proj_bias = view_13 = transformer_h_0_mlp_c_proj_weight = None\n view_14 = addmm_3.view(add_14); addmm_3 = add_14 = None\n transformer_h_0_mlp_dropout 
= getattr(self.transformer.h, "0").mlp.dropout(view_14); view_14 = None\n add_15 = add_10 + transformer_h_0_mlp_dropout; add_10 = transformer_h_0_mlp_dropout = None\n transformer_h_1_ln_1 = getattr(self.transformer.h, "1").ln_1(add_15)\n return (labels, mul, add_2, permute_1, permute_2, add_15, transformer_h_1_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, add_15, transformer_h_1_ln_1):\n # No stacktrace found for following nodes\n size_18 = transformer_h_1_ln_1.size()\n getitem_17 = size_18[slice(None, -1, None)]; size_18 = None\n add_16 = getitem_17 + (2304,); getitem_17 = None\n transformer_h_1_attn_c_attn_bias = getattr(self.transformer.h, "1").attn.c_attn.bias\n size_19 = transformer_h_1_ln_1.size(-1)\n view_15 = transformer_h_1_ln_1.view(-1, size_19); transformer_h_1_ln_1 = size_19 = None\n transformer_h_1_attn_c_attn_weight = getattr(self.transformer.h, "1").attn.c_attn.weight\n addmm_4 = torch.addmm(transformer_h_1_attn_c_attn_bias, view_15, transformer_h_1_attn_c_attn_weight); transformer_h_1_attn_c_attn_bias = view_15 = transformer_h_1_attn_c_attn_weight = None\n view_16 = addmm_4.view(add_16); addmm_4 = add_16 = None\n split_1 = view_16.split(768, dim = 2); view_16 = None\n getitem_18 = split_1[0]\n getitem_19 = split_1[1]\n getitem_20 = split_1[2]; split_1 = None\n size_20 = getitem_18.size()\n getitem_21 = size_20[slice(None, -1, None)]; size_20 = None\n add_17 = getitem_21 + (12, 64); getitem_21 = None\n view_17 = getitem_18.view(add_17); getitem_18 = add_17 = None\n permute_4 = view_17.permute(0, 2, 1, 3); view_17 = None\n size_21 = getitem_19.size()\n getitem_22 = size_21[slice(None, -1, None)]; size_21 = None\n add_18 = getitem_22 + (12, 64); getitem_22 = None\n view_18 = getitem_19.view(add_18); getitem_19 = add_18 = None\n permute_5 = view_18.permute(0, 2, 1, 3); view_18 = None\n size_22 = getitem_20.size()\n getitem_23 = size_22[slice(None, -1, None)]; size_22 = None\n add_19 = 
getitem_23 + (12, 64); getitem_23 = None\n view_19 = getitem_20.view(add_19); getitem_20 = add_19 = None\n permute_6 = view_19.permute(0, 2, 1, 3); view_19 = None\n transpose_1 = permute_5.transpose(-1, -2)\n matmul_2 = torch.matmul(permute_4, transpose_1); transpose_1 = None\n size_23 = permute_6.size(-1)\n pow_3 = size_23 ** 0.5; size_23 = None\n getattr_10 = matmul_2.dtype\n getattr_11 = matmul_2.device\n full_2 = torch.full([], pow_3, dtype = getattr_10, device = getattr_11); pow_3 = getattr_10 = getattr_11 = None\n truediv_1 = matmul_2 / full_2; matmul_2 = full_2 = None\n size_24 = permute_4.size(-2); permute_4 = None\n size_25 = permute_5.size(-2)\n transformer_h_1_attn_bias = getattr(self.transformer.h, "1").attn.bias\n sub_2 = size_25 - size_24; size_24 = None\n getitem_24 = transformer_h_1_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_2, size_25, None), slice(None, size_25, None))]; transformer_h_1_attn_bias = sub_2 = size_25 = None\n getattr_12 = truediv_1.dtype\n finfo_1 = torch.finfo(getattr_12); getattr_12 = None\n getattr_13 = finfo_1.min; finfo_1 = None\n getattr_14 = truediv_1.dtype\n full_3 = torch.full([], getattr_13, dtype = getattr_14); getattr_13 = getattr_14 = None\n getattr_15 = truediv_1.device\n to_3 = full_3.to(getattr_15); full_3 = getattr_15 = None\n getattr_16 = truediv_1.dtype\n to_4 = truediv_1.to(getattr_16); truediv_1 = getattr_16 = None\n where_1 = torch.where(getitem_24, to_4, to_3); getitem_24 = to_4 = to_3 = None\n add_20 = where_1 + mul; where_1 = None\n softmax_1 = torch.nn.functional.softmax(add_20, dim = -1, _stacklevel = 3, dtype = None); add_20 = None\n getattr_17 = permute_6.dtype\n type_2 = softmax_1.type(getattr_17); softmax_1 = getattr_17 = None\n transformer_h_1_attn_attn_dropout = getattr(self.transformer.h, "1").attn.attn_dropout(type_2); type_2 = None\n matmul_3 = torch.matmul(transformer_h_1_attn_attn_dropout, permute_6); transformer_h_1_attn_attn_dropout = None\n permute_7 = 
matmul_3.permute(0, 2, 1, 3); matmul_3 = None\n contiguous_1 = permute_7.contiguous(); permute_7 = None\n size_26 = contiguous_1.size()\n getitem_25 = size_26[slice(None, -2, None)]; size_26 = None\n add_21 = getitem_25 + (768,); getitem_25 = None\n view_20 = contiguous_1.view(add_21); contiguous_1 = add_21 = None\n size_27 = view_20.size()\n getitem_26 = size_27[slice(None, -1, None)]; size_27 = None\n add_22 = getitem_26 + (768,); getitem_26 = None\n transformer_h_1_attn_c_proj_bias = getattr(self.transformer.h, "1").attn.c_proj.bias\n size_28 = view_20.size(-1)\n view_21 = view_20.view(-1, size_28); view_20 = size_28 = None\n transformer_h_1_attn_c_proj_weight = getattr(self.transformer.h, "1").attn.c_proj.weight\n addmm_5 = torch.addmm(transformer_h_1_attn_c_proj_bias, view_21, transformer_h_1_attn_c_proj_weight); transformer_h_1_attn_c_proj_bias = view_21 = transformer_h_1_attn_c_proj_weight = None\n view_22 = addmm_5.view(add_22); addmm_5 = add_22 = None\n transformer_h_1_attn_resid_dropout = getattr(self.transformer.h, "1").attn.resid_dropout(view_22); view_22 = None\n add_23 = transformer_h_1_attn_resid_dropout + add_15; transformer_h_1_attn_resid_dropout = add_15 = None\n transformer_h_1_ln_2 = getattr(self.transformer.h, "1").ln_2(add_23)\n size_29 = transformer_h_1_ln_2.size()\n getitem_27 = size_29[slice(None, -1, None)]; size_29 = None\n add_24 = getitem_27 + (3072,); getitem_27 = None\n transformer_h_1_mlp_c_fc_bias = getattr(self.transformer.h, "1").mlp.c_fc.bias\n size_30 = transformer_h_1_ln_2.size(-1)\n view_23 = transformer_h_1_ln_2.view(-1, size_30); transformer_h_1_ln_2 = size_30 = None\n transformer_h_1_mlp_c_fc_weight = getattr(self.transformer.h, "1").mlp.c_fc.weight\n addmm_6 = torch.addmm(transformer_h_1_mlp_c_fc_bias, view_23, transformer_h_1_mlp_c_fc_weight); transformer_h_1_mlp_c_fc_bias = view_23 = transformer_h_1_mlp_c_fc_weight = None\n view_24 = addmm_6.view(add_24); addmm_6 = add_24 = None\n mul_5 = 0.5 * view_24\n pow_4 = 
torch.pow(view_24, 3.0)\n mul_6 = 0.044715 * pow_4; pow_4 = None\n add_25 = view_24 + mul_6; view_24 = mul_6 = None\n mul_7 = 0.7978845608028654 * add_25; add_25 = None\n tanh_1 = torch.tanh(mul_7); mul_7 = None\n add_26 = 1.0 + tanh_1; tanh_1 = None\n mul_8 = mul_5 * add_26; mul_5 = add_26 = None\n size_31 = mul_8.size()\n getitem_28 = size_31[slice(None, -1, None)]; size_31 = None\n add_27 = getitem_28 + (768,); getitem_28 = None\n transformer_h_1_mlp_c_proj_bias = getattr(self.transformer.h, "1").mlp.c_proj.bias\n size_32 = mul_8.size(-1)\n view_25 = mul_8.view(-1, size_32); mul_8 = size_32 = None\n transformer_h_1_mlp_c_proj_weight = getattr(self.transformer.h, "1").mlp.c_proj.weight\n addmm_7 = torch.addmm(transformer_h_1_mlp_c_proj_bias, view_25, transformer_h_1_mlp_c_proj_weight); transformer_h_1_mlp_c_proj_bias = view_25 = transformer_h_1_mlp_c_proj_weight = None\n view_26 = addmm_7.view(add_27); addmm_7 = add_27 = None\n transformer_h_1_mlp_dropout = getattr(self.transformer.h, "1").mlp.dropout(view_26); view_26 = None\n add_28 = add_23 + transformer_h_1_mlp_dropout; add_23 = transformer_h_1_mlp_dropout = None\n transformer_h_2_ln_1 = getattr(self.transformer.h, "2").ln_1(add_28)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, add_28, transformer_h_2_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, add_28, transformer_h_2_ln_1):\n # No stacktrace found for following nodes\n size_33 = transformer_h_2_ln_1.size()\n getitem_29 = size_33[slice(None, -1, None)]; size_33 = None\n add_29 = getitem_29 + (2304,); getitem_29 = None\n transformer_h_2_attn_c_attn_bias = getattr(self.transformer.h, "2").attn.c_attn.bias\n size_34 = transformer_h_2_ln_1.size(-1)\n view_27 = transformer_h_2_ln_1.view(-1, size_34); transformer_h_2_ln_1 = size_34 = None\n transformer_h_2_attn_c_attn_weight = getattr(self.transformer.h, "2").attn.c_attn.weight\n addmm_8 = 
torch.addmm(transformer_h_2_attn_c_attn_bias, view_27, transformer_h_2_attn_c_attn_weight); transformer_h_2_attn_c_attn_bias = view_27 = transformer_h_2_attn_c_attn_weight = None\n view_28 = addmm_8.view(add_29); addmm_8 = add_29 = None\n split_2 = view_28.split(768, dim = 2); view_28 = None\n getitem_30 = split_2[0]\n getitem_31 = split_2[1]\n getitem_32 = split_2[2]; split_2 = None\n size_35 = getitem_30.size()\n getitem_33 = size_35[slice(None, -1, None)]; size_35 = None\n add_30 = getitem_33 + (12, 64); getitem_33 = None\n view_29 = getitem_30.view(add_30); getitem_30 = add_30 = None\n permute_8 = view_29.permute(0, 2, 1, 3); view_29 = None\n size_36 = getitem_31.size()\n getitem_34 = size_36[slice(None, -1, None)]; size_36 = None\n add_31 = getitem_34 + (12, 64); getitem_34 = None\n view_30 = getitem_31.view(add_31); getitem_31 = add_31 = None\n permute_9 = view_30.permute(0, 2, 1, 3); view_30 = None\n size_37 = getitem_32.size()\n getitem_35 = size_37[slice(None, -1, None)]; size_37 = None\n add_32 = getitem_35 + (12, 64); getitem_35 = None\n view_31 = getitem_32.view(add_32); getitem_32 = add_32 = None\n permute_10 = view_31.permute(0, 2, 1, 3); view_31 = None\n transpose_2 = permute_9.transpose(-1, -2)\n matmul_4 = torch.matmul(permute_8, transpose_2); transpose_2 = None\n size_38 = permute_10.size(-1)\n pow_5 = size_38 ** 0.5; size_38 = None\n getattr_18 = matmul_4.dtype\n getattr_19 = matmul_4.device\n full_4 = torch.full([], pow_5, dtype = getattr_18, device = getattr_19); pow_5 = getattr_18 = getattr_19 = None\n truediv_2 = matmul_4 / full_4; matmul_4 = full_4 = None\n size_39 = permute_8.size(-2); permute_8 = None\n size_40 = permute_9.size(-2)\n transformer_h_2_attn_bias = getattr(self.transformer.h, "2").attn.bias\n sub_3 = size_40 - size_39; size_39 = None\n getitem_36 = transformer_h_2_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_3, size_40, None), slice(None, size_40, None))]; transformer_h_2_attn_bias = sub_3 = size_40 = 
None\n getattr_20 = truediv_2.dtype\n finfo_2 = torch.finfo(getattr_20); getattr_20 = None\n getattr_21 = finfo_2.min; finfo_2 = None\n getattr_22 = truediv_2.dtype\n full_5 = torch.full([], getattr_21, dtype = getattr_22); getattr_21 = getattr_22 = None\n getattr_23 = truediv_2.device\n to_5 = full_5.to(getattr_23); full_5 = getattr_23 = None\n getattr_24 = truediv_2.dtype\n to_6 = truediv_2.to(getattr_24); truediv_2 = getattr_24 = None\n where_2 = torch.where(getitem_36, to_6, to_5); getitem_36 = to_6 = to_5 = None\n add_33 = where_2 + mul; where_2 = None\n softmax_2 = torch.nn.functional.softmax(add_33, dim = -1, _stacklevel = 3, dtype = None); add_33 = None\n getattr_25 = permute_10.dtype\n type_3 = softmax_2.type(getattr_25); softmax_2 = getattr_25 = None\n transformer_h_2_attn_attn_dropout = getattr(self.transformer.h, "2").attn.attn_dropout(type_3); type_3 = None\n matmul_5 = torch.matmul(transformer_h_2_attn_attn_dropout, permute_10); transformer_h_2_attn_attn_dropout = None\n permute_11 = matmul_5.permute(0, 2, 1, 3); matmul_5 = None\n contiguous_2 = permute_11.contiguous(); permute_11 = None\n size_41 = contiguous_2.size()\n getitem_37 = size_41[slice(None, -2, None)]; size_41 = None\n add_34 = getitem_37 + (768,); getitem_37 = None\n view_32 = contiguous_2.view(add_34); contiguous_2 = add_34 = None\n size_42 = view_32.size()\n getitem_38 = size_42[slice(None, -1, None)]; size_42 = None\n add_35 = getitem_38 + (768,); getitem_38 = None\n transformer_h_2_attn_c_proj_bias = getattr(self.transformer.h, "2").attn.c_proj.bias\n size_43 = view_32.size(-1)\n view_33 = view_32.view(-1, size_43); view_32 = size_43 = None\n transformer_h_2_attn_c_proj_weight = getattr(self.transformer.h, "2").attn.c_proj.weight\n addmm_9 = torch.addmm(transformer_h_2_attn_c_proj_bias, view_33, transformer_h_2_attn_c_proj_weight); transformer_h_2_attn_c_proj_bias = view_33 = transformer_h_2_attn_c_proj_weight = None\n view_34 = addmm_9.view(add_35); addmm_9 = add_35 = None\n 
transformer_h_2_attn_resid_dropout = getattr(self.transformer.h, "2").attn.resid_dropout(view_34); view_34 = None\n add_36 = transformer_h_2_attn_resid_dropout + add_28; transformer_h_2_attn_resid_dropout = add_28 = None\n transformer_h_2_ln_2 = getattr(self.transformer.h, "2").ln_2(add_36)\n size_44 = transformer_h_2_ln_2.size()\n getitem_39 = size_44[slice(None, -1, None)]; size_44 = None\n add_37 = getitem_39 + (3072,); getitem_39 = None\n transformer_h_2_mlp_c_fc_bias = getattr(self.transformer.h, "2").mlp.c_fc.bias\n size_45 = transformer_h_2_ln_2.size(-1)\n view_35 = transformer_h_2_ln_2.view(-1, size_45); transformer_h_2_ln_2 = size_45 = None\n transformer_h_2_mlp_c_fc_weight = getattr(self.transformer.h, "2").mlp.c_fc.weight\n addmm_10 = torch.addmm(transformer_h_2_mlp_c_fc_bias, view_35, transformer_h_2_mlp_c_fc_weight); transformer_h_2_mlp_c_fc_bias = view_35 = transformer_h_2_mlp_c_fc_weight = None\n view_36 = addmm_10.view(add_37); addmm_10 = add_37 = None\n mul_9 = 0.5 * view_36\n pow_6 = torch.pow(view_36, 3.0)\n mul_10 = 0.044715 * pow_6; pow_6 = None\n add_38 = view_36 + mul_10; view_36 = mul_10 = None\n mul_11 = 0.7978845608028654 * add_38; add_38 = None\n tanh_2 = torch.tanh(mul_11); mul_11 = None\n add_39 = 1.0 + tanh_2; tanh_2 = None\n mul_12 = mul_9 * add_39; mul_9 = add_39 = None\n size_46 = mul_12.size()\n getitem_40 = size_46[slice(None, -1, None)]; size_46 = None\n add_40 = getitem_40 + (768,); getitem_40 = None\n transformer_h_2_mlp_c_proj_bias = getattr(self.transformer.h, "2").mlp.c_proj.bias\n size_47 = mul_12.size(-1)\n view_37 = mul_12.view(-1, size_47); mul_12 = size_47 = None\n transformer_h_2_mlp_c_proj_weight = getattr(self.transformer.h, "2").mlp.c_proj.weight\n addmm_11 = torch.addmm(transformer_h_2_mlp_c_proj_bias, view_37, transformer_h_2_mlp_c_proj_weight); transformer_h_2_mlp_c_proj_bias = view_37 = transformer_h_2_mlp_c_proj_weight = None\n view_38 = addmm_11.view(add_40); addmm_11 = add_40 = None\n 
transformer_h_2_mlp_dropout = getattr(self.transformer.h, "2").mlp.dropout(view_38); view_38 = None\n add_41 = add_36 + transformer_h_2_mlp_dropout; add_36 = transformer_h_2_mlp_dropout = None\n transformer_h_3_ln_1 = getattr(self.transformer.h, "3").ln_1(add_41)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, add_41, transformer_h_3_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, add_41, transformer_h_3_ln_1):\n # No stacktrace found for following nodes\n size_48 = transformer_h_3_ln_1.size()\n getitem_41 = size_48[slice(None, -1, None)]; size_48 = None\n add_42 = getitem_41 + (2304,); getitem_41 = None\n transformer_h_3_attn_c_attn_bias = getattr(self.transformer.h, "3").attn.c_attn.bias\n size_49 = transformer_h_3_ln_1.size(-1)\n view_39 = transformer_h_3_ln_1.view(-1, size_49); transformer_h_3_ln_1 = size_49 = None\n transformer_h_3_attn_c_attn_weight = getattr(self.transformer.h, "3").attn.c_attn.weight\n addmm_12 = torch.addmm(transformer_h_3_attn_c_attn_bias, view_39, transformer_h_3_attn_c_attn_weight); transformer_h_3_attn_c_attn_bias = view_39 = transformer_h_3_attn_c_attn_weight = None\n view_40 = addmm_12.view(add_42); addmm_12 = add_42 = None\n split_3 = view_40.split(768, dim = 2); view_40 = None\n getitem_42 = split_3[0]\n getitem_43 = split_3[1]\n getitem_44 = split_3[2]; split_3 = None\n size_50 = getitem_42.size()\n getitem_45 = size_50[slice(None, -1, None)]; size_50 = None\n add_43 = getitem_45 + (12, 64); getitem_45 = None\n view_41 = getitem_42.view(add_43); getitem_42 = add_43 = None\n permute_12 = view_41.permute(0, 2, 1, 3); view_41 = None\n size_51 = getitem_43.size()\n getitem_46 = size_51[slice(None, -1, None)]; size_51 = None\n add_44 = getitem_46 + (12, 64); getitem_46 = None\n view_42 = getitem_43.view(add_44); getitem_43 = add_44 = None\n permute_13 = view_42.permute(0, 2, 1, 3); 
view_42 = None\n size_52 = getitem_44.size()\n getitem_47 = size_52[slice(None, -1, None)]; size_52 = None\n add_45 = getitem_47 + (12, 64); getitem_47 = None\n view_43 = getitem_44.view(add_45); getitem_44 = add_45 = None\n permute_14 = view_43.permute(0, 2, 1, 3); view_43 = None\n transpose_3 = permute_13.transpose(-1, -2)\n matmul_6 = torch.matmul(permute_12, transpose_3); transpose_3 = None\n size_53 = permute_14.size(-1)\n pow_7 = size_53 ** 0.5; size_53 = None\n getattr_26 = matmul_6.dtype\n getattr_27 = matmul_6.device\n full_6 = torch.full([], pow_7, dtype = getattr_26, device = getattr_27); pow_7 = getattr_26 = getattr_27 = None\n truediv_3 = matmul_6 / full_6; matmul_6 = full_6 = None\n size_54 = permute_12.size(-2); permute_12 = None\n size_55 = permute_13.size(-2)\n transformer_h_3_attn_bias = getattr(self.transformer.h, "3").attn.bias\n sub_4 = size_55 - size_54; size_54 = None\n getitem_48 = transformer_h_3_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_4, size_55, None), slice(None, size_55, None))]; transformer_h_3_attn_bias = sub_4 = size_55 = None\n getattr_28 = truediv_3.dtype\n finfo_3 = torch.finfo(getattr_28); getattr_28 = None\n getattr_29 = finfo_3.min; finfo_3 = None\n getattr_30 = truediv_3.dtype\n full_7 = torch.full([], getattr_29, dtype = getattr_30); getattr_29 = getattr_30 = None\n getattr_31 = truediv_3.device\n to_7 = full_7.to(getattr_31); full_7 = getattr_31 = None\n getattr_32 = truediv_3.dtype\n to_8 = truediv_3.to(getattr_32); truediv_3 = getattr_32 = None\n where_3 = torch.where(getitem_48, to_8, to_7); getitem_48 = to_8 = to_7 = None\n add_46 = where_3 + mul; where_3 = None\n softmax_3 = torch.nn.functional.softmax(add_46, dim = -1, _stacklevel = 3, dtype = None); add_46 = None\n getattr_33 = permute_14.dtype\n type_4 = softmax_3.type(getattr_33); softmax_3 = getattr_33 = None\n transformer_h_3_attn_attn_dropout = getattr(self.transformer.h, "3").attn.attn_dropout(type_4); type_4 = None\n matmul_7 = 
torch.matmul(transformer_h_3_attn_attn_dropout, permute_14); transformer_h_3_attn_attn_dropout = None\n permute_15 = matmul_7.permute(0, 2, 1, 3); matmul_7 = None\n contiguous_3 = permute_15.contiguous(); permute_15 = None\n size_56 = contiguous_3.size()\n getitem_49 = size_56[slice(None, -2, None)]; size_56 = None\n add_47 = getitem_49 + (768,); getitem_49 = None\n view_44 = contiguous_3.view(add_47); contiguous_3 = add_47 = None\n size_57 = view_44.size()\n getitem_50 = size_57[slice(None, -1, None)]; size_57 = None\n add_48 = getitem_50 + (768,); getitem_50 = None\n transformer_h_3_attn_c_proj_bias = getattr(self.transformer.h, "3").attn.c_proj.bias\n size_58 = view_44.size(-1)\n view_45 = view_44.view(-1, size_58); view_44 = size_58 = None\n transformer_h_3_attn_c_proj_weight = getattr(self.transformer.h, "3").attn.c_proj.weight\n addmm_13 = torch.addmm(transformer_h_3_attn_c_proj_bias, view_45, transformer_h_3_attn_c_proj_weight); transformer_h_3_attn_c_proj_bias = view_45 = transformer_h_3_attn_c_proj_weight = None\n view_46 = addmm_13.view(add_48); addmm_13 = add_48 = None\n transformer_h_3_attn_resid_dropout = getattr(self.transformer.h, "3").attn.resid_dropout(view_46); view_46 = None\n add_49 = transformer_h_3_attn_resid_dropout + add_41; transformer_h_3_attn_resid_dropout = add_41 = None\n transformer_h_3_ln_2 = getattr(self.transformer.h, "3").ln_2(add_49)\n size_59 = transformer_h_3_ln_2.size()\n getitem_51 = size_59[slice(None, -1, None)]; size_59 = None\n add_50 = getitem_51 + (3072,); getitem_51 = None\n transformer_h_3_mlp_c_fc_bias = getattr(self.transformer.h, "3").mlp.c_fc.bias\n size_60 = transformer_h_3_ln_2.size(-1)\n view_47 = transformer_h_3_ln_2.view(-1, size_60); transformer_h_3_ln_2 = size_60 = None\n transformer_h_3_mlp_c_fc_weight = getattr(self.transformer.h, "3").mlp.c_fc.weight\n addmm_14 = torch.addmm(transformer_h_3_mlp_c_fc_bias, view_47, transformer_h_3_mlp_c_fc_weight); transformer_h_3_mlp_c_fc_bias = view_47 = 
transformer_h_3_mlp_c_fc_weight = None\n view_48 = addmm_14.view(add_50); addmm_14 = add_50 = None\n mul_13 = 0.5 * view_48\n pow_8 = torch.pow(view_48, 3.0)\n mul_14 = 0.044715 * pow_8; pow_8 = None\n add_51 = view_48 + mul_14; view_48 = mul_14 = None\n mul_15 = 0.7978845608028654 * add_51; add_51 = None\n tanh_3 = torch.tanh(mul_15); mul_15 = None\n add_52 = 1.0 + tanh_3; tanh_3 = None\n mul_16 = mul_13 * add_52; mul_13 = add_52 = None\n size_61 = mul_16.size()\n getitem_52 = size_61[slice(None, -1, None)]; size_61 = None\n add_53 = getitem_52 + (768,); getitem_52 = None\n transformer_h_3_mlp_c_proj_bias = getattr(self.transformer.h, "3").mlp.c_proj.bias\n size_62 = mul_16.size(-1)\n view_49 = mul_16.view(-1, size_62); mul_16 = size_62 = None\n transformer_h_3_mlp_c_proj_weight = getattr(self.transformer.h, "3").mlp.c_proj.weight\n addmm_15 = torch.addmm(transformer_h_3_mlp_c_proj_bias, view_49, transformer_h_3_mlp_c_proj_weight); transformer_h_3_mlp_c_proj_bias = view_49 = transformer_h_3_mlp_c_proj_weight = None\n view_50 = addmm_15.view(add_53); addmm_15 = add_53 = None\n transformer_h_3_mlp_dropout = getattr(self.transformer.h, "3").mlp.dropout(view_50); view_50 = None\n add_54 = add_49 + transformer_h_3_mlp_dropout; add_49 = transformer_h_3_mlp_dropout = None\n transformer_h_4_ln_1 = getattr(self.transformer.h, "4").ln_1(add_54)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, add_54, transformer_h_4_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, add_54, transformer_h_4_ln_1):\n # No stacktrace found for following nodes\n size_63 = transformer_h_4_ln_1.size()\n getitem_53 = size_63[slice(None, -1, None)]; size_63 = None\n add_55 = getitem_53 + (2304,); getitem_53 = None\n transformer_h_4_attn_c_attn_bias = getattr(self.transformer.h, "4").attn.c_attn.bias\n 
size_64 = transformer_h_4_ln_1.size(-1)\n view_51 = transformer_h_4_ln_1.view(-1, size_64); transformer_h_4_ln_1 = size_64 = None\n transformer_h_4_attn_c_attn_weight = getattr(self.transformer.h, "4").attn.c_attn.weight\n addmm_16 = torch.addmm(transformer_h_4_attn_c_attn_bias, view_51, transformer_h_4_attn_c_attn_weight); transformer_h_4_attn_c_attn_bias = view_51 = transformer_h_4_attn_c_attn_weight = None\n view_52 = addmm_16.view(add_55); addmm_16 = add_55 = None\n split_4 = view_52.split(768, dim = 2); view_52 = None\n getitem_54 = split_4[0]\n getitem_55 = split_4[1]\n getitem_56 = split_4[2]; split_4 = None\n size_65 = getitem_54.size()\n getitem_57 = size_65[slice(None, -1, None)]; size_65 = None\n add_56 = getitem_57 + (12, 64); getitem_57 = None\n view_53 = getitem_54.view(add_56); getitem_54 = add_56 = None\n permute_16 = view_53.permute(0, 2, 1, 3); view_53 = None\n size_66 = getitem_55.size()\n getitem_58 = size_66[slice(None, -1, None)]; size_66 = None\n add_57 = getitem_58 + (12, 64); getitem_58 = None\n view_54 = getitem_55.view(add_57); getitem_55 = add_57 = None\n permute_17 = view_54.permute(0, 2, 1, 3); view_54 = None\n size_67 = getitem_56.size()\n getitem_59 = size_67[slice(None, -1, None)]; size_67 = None\n add_58 = getitem_59 + (12, 64); getitem_59 = None\n view_55 = getitem_56.view(add_58); getitem_56 = add_58 = None\n permute_18 = view_55.permute(0, 2, 1, 3); view_55 = None\n transpose_4 = permute_17.transpose(-1, -2)\n matmul_8 = torch.matmul(permute_16, transpose_4); transpose_4 = None\n size_68 = permute_18.size(-1)\n pow_9 = size_68 ** 0.5; size_68 = None\n getattr_34 = matmul_8.dtype\n getattr_35 = matmul_8.device\n full_8 = torch.full([], pow_9, dtype = getattr_34, device = getattr_35); pow_9 = getattr_34 = getattr_35 = None\n truediv_4 = matmul_8 / full_8; matmul_8 = full_8 = None\n size_69 = permute_16.size(-2); permute_16 = None\n size_70 = permute_17.size(-2)\n transformer_h_4_attn_bias = getattr(self.transformer.h, 
"4").attn.bias\n sub_5 = size_70 - size_69; size_69 = None\n getitem_60 = transformer_h_4_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_5, size_70, None), slice(None, size_70, None))]; transformer_h_4_attn_bias = sub_5 = size_70 = None\n getattr_36 = truediv_4.dtype\n finfo_4 = torch.finfo(getattr_36); getattr_36 = None\n getattr_37 = finfo_4.min; finfo_4 = None\n getattr_38 = truediv_4.dtype\n full_9 = torch.full([], getattr_37, dtype = getattr_38); getattr_37 = getattr_38 = None\n getattr_39 = truediv_4.device\n to_9 = full_9.to(getattr_39); full_9 = getattr_39 = None\n getattr_40 = truediv_4.dtype\n to_10 = truediv_4.to(getattr_40); truediv_4 = getattr_40 = None\n where_4 = torch.where(getitem_60, to_10, to_9); getitem_60 = to_10 = to_9 = None\n add_59 = where_4 + mul; where_4 = None\n softmax_4 = torch.nn.functional.softmax(add_59, dim = -1, _stacklevel = 3, dtype = None); add_59 = None\n getattr_41 = permute_18.dtype\n type_5 = softmax_4.type(getattr_41); softmax_4 = getattr_41 = None\n transformer_h_4_attn_attn_dropout = getattr(self.transformer.h, "4").attn.attn_dropout(type_5); type_5 = None\n matmul_9 = torch.matmul(transformer_h_4_attn_attn_dropout, permute_18); transformer_h_4_attn_attn_dropout = None\n permute_19 = matmul_9.permute(0, 2, 1, 3); matmul_9 = None\n contiguous_4 = permute_19.contiguous(); permute_19 = None\n size_71 = contiguous_4.size()\n getitem_61 = size_71[slice(None, -2, None)]; size_71 = None\n add_60 = getitem_61 + (768,); getitem_61 = None\n view_56 = contiguous_4.view(add_60); contiguous_4 = add_60 = None\n size_72 = view_56.size()\n getitem_62 = size_72[slice(None, -1, None)]; size_72 = None\n add_61 = getitem_62 + (768,); getitem_62 = None\n transformer_h_4_attn_c_proj_bias = getattr(self.transformer.h, "4").attn.c_proj.bias\n size_73 = view_56.size(-1)\n view_57 = view_56.view(-1, size_73); view_56 = size_73 = None\n transformer_h_4_attn_c_proj_weight = getattr(self.transformer.h, 
"4").attn.c_proj.weight\n addmm_17 = torch.addmm(transformer_h_4_attn_c_proj_bias, view_57, transformer_h_4_attn_c_proj_weight); transformer_h_4_attn_c_proj_bias = view_57 = transformer_h_4_attn_c_proj_weight = None\n view_58 = addmm_17.view(add_61); addmm_17 = add_61 = None\n transformer_h_4_attn_resid_dropout = getattr(self.transformer.h, "4").attn.resid_dropout(view_58); view_58 = None\n add_62 = transformer_h_4_attn_resid_dropout + add_54; transformer_h_4_attn_resid_dropout = add_54 = None\n transformer_h_4_ln_2 = getattr(self.transformer.h, "4").ln_2(add_62)\n size_74 = transformer_h_4_ln_2.size()\n getitem_63 = size_74[slice(None, -1, None)]; size_74 = None\n add_63 = getitem_63 + (3072,); getitem_63 = None\n transformer_h_4_mlp_c_fc_bias = getattr(self.transformer.h, "4").mlp.c_fc.bias\n size_75 = transformer_h_4_ln_2.size(-1)\n view_59 = transformer_h_4_ln_2.view(-1, size_75); transformer_h_4_ln_2 = size_75 = None\n transformer_h_4_mlp_c_fc_weight = getattr(self.transformer.h, "4").mlp.c_fc.weight\n addmm_18 = torch.addmm(transformer_h_4_mlp_c_fc_bias, view_59, transformer_h_4_mlp_c_fc_weight); transformer_h_4_mlp_c_fc_bias = view_59 = transformer_h_4_mlp_c_fc_weight = None\n view_60 = addmm_18.view(add_63); addmm_18 = add_63 = None\n mul_17 = 0.5 * view_60\n pow_10 = torch.pow(view_60, 3.0)\n mul_18 = 0.044715 * pow_10; pow_10 = None\n add_64 = view_60 + mul_18; view_60 = mul_18 = None\n mul_19 = 0.7978845608028654 * add_64; add_64 = None\n tanh_4 = torch.tanh(mul_19); mul_19 = None\n add_65 = 1.0 + tanh_4; tanh_4 = None\n mul_20 = mul_17 * add_65; mul_17 = add_65 = None\n size_76 = mul_20.size()\n getitem_64 = size_76[slice(None, -1, None)]; size_76 = None\n add_66 = getitem_64 + (768,); getitem_64 = None\n transformer_h_4_mlp_c_proj_bias = getattr(self.transformer.h, "4").mlp.c_proj.bias\n size_77 = mul_20.size(-1)\n view_61 = mul_20.view(-1, size_77); mul_20 = size_77 = None\n transformer_h_4_mlp_c_proj_weight = getattr(self.transformer.h, 
"4").mlp.c_proj.weight\n addmm_19 = torch.addmm(transformer_h_4_mlp_c_proj_bias, view_61, transformer_h_4_mlp_c_proj_weight); transformer_h_4_mlp_c_proj_bias = view_61 = transformer_h_4_mlp_c_proj_weight = None\n view_62 = addmm_19.view(add_66); addmm_19 = add_66 = None\n transformer_h_4_mlp_dropout = getattr(self.transformer.h, "4").mlp.dropout(view_62); view_62 = None\n add_67 = add_62 + transformer_h_4_mlp_dropout; add_62 = transformer_h_4_mlp_dropout = None\n transformer_h_5_ln_1 = getattr(self.transformer.h, "5").ln_1(add_67)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, add_67, transformer_h_5_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, add_67, transformer_h_5_ln_1):\n # No stacktrace found for following nodes\n size_78 = transformer_h_5_ln_1.size()\n getitem_65 = size_78[slice(None, -1, None)]; size_78 = None\n add_68 = getitem_65 + (2304,); getitem_65 = None\n transformer_h_5_attn_c_attn_bias = getattr(self.transformer.h, "5").attn.c_attn.bias\n size_79 = transformer_h_5_ln_1.size(-1)\n view_63 = transformer_h_5_ln_1.view(-1, size_79); transformer_h_5_ln_1 = size_79 = None\n transformer_h_5_attn_c_attn_weight = getattr(self.transformer.h, "5").attn.c_attn.weight\n addmm_20 = torch.addmm(transformer_h_5_attn_c_attn_bias, view_63, transformer_h_5_attn_c_attn_weight); transformer_h_5_attn_c_attn_bias = view_63 = transformer_h_5_attn_c_attn_weight = None\n view_64 = addmm_20.view(add_68); addmm_20 = add_68 = None\n split_5 = view_64.split(768, dim = 2); view_64 = None\n getitem_66 = split_5[0]\n getitem_67 = split_5[1]\n getitem_68 = split_5[2]; split_5 = None\n size_80 = getitem_66.size()\n getitem_69 = size_80[slice(None, -1, None)]; size_80 = None\n add_69 = getitem_69 + (12, 64); getitem_69 = None\n 
view_65 = getitem_66.view(add_69); getitem_66 = add_69 = None\n permute_20 = view_65.permute(0, 2, 1, 3); view_65 = None\n size_81 = getitem_67.size()\n getitem_70 = size_81[slice(None, -1, None)]; size_81 = None\n add_70 = getitem_70 + (12, 64); getitem_70 = None\n view_66 = getitem_67.view(add_70); getitem_67 = add_70 = None\n permute_21 = view_66.permute(0, 2, 1, 3); view_66 = None\n size_82 = getitem_68.size()\n getitem_71 = size_82[slice(None, -1, None)]; size_82 = None\n add_71 = getitem_71 + (12, 64); getitem_71 = None\n view_67 = getitem_68.view(add_71); getitem_68 = add_71 = None\n permute_22 = view_67.permute(0, 2, 1, 3); view_67 = None\n transpose_5 = permute_21.transpose(-1, -2)\n matmul_10 = torch.matmul(permute_20, transpose_5); transpose_5 = None\n size_83 = permute_22.size(-1)\n pow_11 = size_83 ** 0.5; size_83 = None\n getattr_42 = matmul_10.dtype\n getattr_43 = matmul_10.device\n full_10 = torch.full([], pow_11, dtype = getattr_42, device = getattr_43); pow_11 = getattr_42 = getattr_43 = None\n truediv_5 = matmul_10 / full_10; matmul_10 = full_10 = None\n size_84 = permute_20.size(-2); permute_20 = None\n size_85 = permute_21.size(-2)\n transformer_h_5_attn_bias = getattr(self.transformer.h, "5").attn.bias\n sub_6 = size_85 - size_84; size_84 = None\n getitem_72 = transformer_h_5_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_6, size_85, None), slice(None, size_85, None))]; transformer_h_5_attn_bias = sub_6 = size_85 = None\n getattr_44 = truediv_5.dtype\n finfo_5 = torch.finfo(getattr_44); getattr_44 = None\n getattr_45 = finfo_5.min; finfo_5 = None\n getattr_46 = truediv_5.dtype\n full_11 = torch.full([], getattr_45, dtype = getattr_46); getattr_45 = getattr_46 = None\n getattr_47 = truediv_5.device\n to_11 = full_11.to(getattr_47); full_11 = getattr_47 = None\n getattr_48 = truediv_5.dtype\n to_12 = truediv_5.to(getattr_48); truediv_5 = getattr_48 = None\n where_5 = torch.where(getitem_72, to_12, to_11); getitem_72 = 
to_12 = to_11 = None\n add_72 = where_5 + mul; where_5 = None\n softmax_5 = torch.nn.functional.softmax(add_72, dim = -1, _stacklevel = 3, dtype = None); add_72 = None\n getattr_49 = permute_22.dtype\n type_6 = softmax_5.type(getattr_49); softmax_5 = getattr_49 = None\n transformer_h_5_attn_attn_dropout = getattr(self.transformer.h, "5").attn.attn_dropout(type_6); type_6 = None\n matmul_11 = torch.matmul(transformer_h_5_attn_attn_dropout, permute_22); transformer_h_5_attn_attn_dropout = None\n permute_23 = matmul_11.permute(0, 2, 1, 3); matmul_11 = None\n contiguous_5 = permute_23.contiguous(); permute_23 = None\n size_86 = contiguous_5.size()\n getitem_73 = size_86[slice(None, -2, None)]; size_86 = None\n add_73 = getitem_73 + (768,); getitem_73 = None\n view_68 = contiguous_5.view(add_73); contiguous_5 = add_73 = None\n size_87 = view_68.size()\n getitem_74 = size_87[slice(None, -1, None)]; size_87 = None\n add_74 = getitem_74 + (768,); getitem_74 = None\n transformer_h_5_attn_c_proj_bias = getattr(self.transformer.h, "5").attn.c_proj.bias\n size_88 = view_68.size(-1)\n view_69 = view_68.view(-1, size_88); view_68 = size_88 = None\n transformer_h_5_attn_c_proj_weight = getattr(self.transformer.h, "5").attn.c_proj.weight\n addmm_21 = torch.addmm(transformer_h_5_attn_c_proj_bias, view_69, transformer_h_5_attn_c_proj_weight); transformer_h_5_attn_c_proj_bias = view_69 = transformer_h_5_attn_c_proj_weight = None\n view_70 = addmm_21.view(add_74); addmm_21 = add_74 = None\n transformer_h_5_attn_resid_dropout = getattr(self.transformer.h, "5").attn.resid_dropout(view_70); view_70 = None\n add_75 = transformer_h_5_attn_resid_dropout + add_67; transformer_h_5_attn_resid_dropout = add_67 = None\n transformer_h_5_ln_2 = getattr(self.transformer.h, "5").ln_2(add_75)\n size_89 = transformer_h_5_ln_2.size()\n getitem_75 = size_89[slice(None, -1, None)]; size_89 = None\n add_76 = getitem_75 + (3072,); getitem_75 = None\n transformer_h_5_mlp_c_fc_bias = 
getattr(self.transformer.h, "5").mlp.c_fc.bias\n size_90 = transformer_h_5_ln_2.size(-1)\n view_71 = transformer_h_5_ln_2.view(-1, size_90); transformer_h_5_ln_2 = size_90 = None\n transformer_h_5_mlp_c_fc_weight = getattr(self.transformer.h, "5").mlp.c_fc.weight\n addmm_22 = torch.addmm(transformer_h_5_mlp_c_fc_bias, view_71, transformer_h_5_mlp_c_fc_weight); transformer_h_5_mlp_c_fc_bias = view_71 = transformer_h_5_mlp_c_fc_weight = None\n view_72 = addmm_22.view(add_76); addmm_22 = add_76 = None\n mul_21 = 0.5 * view_72\n pow_12 = torch.pow(view_72, 3.0)\n mul_22 = 0.044715 * pow_12; pow_12 = None\n add_77 = view_72 + mul_22; view_72 = mul_22 = None\n mul_23 = 0.7978845608028654 * add_77; add_77 = None\n tanh_5 = torch.tanh(mul_23); mul_23 = None\n add_78 = 1.0 + tanh_5; tanh_5 = None\n mul_24 = mul_21 * add_78; mul_21 = add_78 = None\n size_91 = mul_24.size()\n getitem_76 = size_91[slice(None, -1, None)]; size_91 = None\n add_79 = getitem_76 + (768,); getitem_76 = None\n transformer_h_5_mlp_c_proj_bias = getattr(self.transformer.h, "5").mlp.c_proj.bias\n size_92 = mul_24.size(-1)\n view_73 = mul_24.view(-1, size_92); mul_24 = size_92 = None\n transformer_h_5_mlp_c_proj_weight = getattr(self.transformer.h, "5").mlp.c_proj.weight\n addmm_23 = torch.addmm(transformer_h_5_mlp_c_proj_bias, view_73, transformer_h_5_mlp_c_proj_weight); transformer_h_5_mlp_c_proj_bias = view_73 = transformer_h_5_mlp_c_proj_weight = None\n view_74 = addmm_23.view(add_79); addmm_23 = add_79 = None\n transformer_h_5_mlp_dropout = getattr(self.transformer.h, "5").mlp.dropout(view_74); view_74 = None\n add_80 = add_75 + transformer_h_5_mlp_dropout; add_75 = transformer_h_5_mlp_dropout = None\n transformer_h_6_ln_1 = getattr(self.transformer.h, "6").ln_1(add_80)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, add_80, transformer_h_6_ln_1)\n ', 'class 
GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, add_80, transformer_h_6_ln_1):\n # No stacktrace found for following nodes\n size_93 = transformer_h_6_ln_1.size()\n getitem_77 = size_93[slice(None, -1, None)]; size_93 = None\n add_81 = getitem_77 + (2304,); getitem_77 = None\n transformer_h_6_attn_c_attn_bias = getattr(self.transformer.h, "6").attn.c_attn.bias\n size_94 = transformer_h_6_ln_1.size(-1)\n view_75 = transformer_h_6_ln_1.view(-1, size_94); transformer_h_6_ln_1 = size_94 = None\n transformer_h_6_attn_c_attn_weight = getattr(self.transformer.h, "6").attn.c_attn.weight\n addmm_24 = torch.addmm(transformer_h_6_attn_c_attn_bias, view_75, transformer_h_6_attn_c_attn_weight); transformer_h_6_attn_c_attn_bias = view_75 = transformer_h_6_attn_c_attn_weight = None\n view_76 = addmm_24.view(add_81); addmm_24 = add_81 = None\n split_6 = view_76.split(768, dim = 2); view_76 = None\n getitem_78 = split_6[0]\n getitem_79 = split_6[1]\n getitem_80 = split_6[2]; split_6 = None\n size_95 = getitem_78.size()\n getitem_81 = size_95[slice(None, -1, None)]; size_95 = None\n add_82 = getitem_81 + (12, 64); getitem_81 = None\n view_77 = getitem_78.view(add_82); getitem_78 = add_82 = None\n permute_24 = view_77.permute(0, 2, 1, 3); view_77 = None\n size_96 = getitem_79.size()\n getitem_82 = size_96[slice(None, -1, None)]; size_96 = None\n add_83 = getitem_82 + (12, 64); getitem_82 = None\n view_78 = getitem_79.view(add_83); getitem_79 = add_83 = None\n permute_25 = view_78.permute(0, 2, 1, 3); view_78 = None\n size_97 = getitem_80.size()\n getitem_83 = size_97[slice(None, -1, None)]; size_97 = None\n add_84 = getitem_83 + (12, 64); getitem_83 = None\n view_79 = getitem_80.view(add_84); getitem_80 = add_84 = None\n permute_26 = view_79.permute(0, 2, 1, 3); view_79 = None\n transpose_6 = permute_25.transpose(-1, -2)\n 
matmul_12 = torch.matmul(permute_24, transpose_6); transpose_6 = None\n size_98 = permute_26.size(-1)\n pow_13 = size_98 ** 0.5; size_98 = None\n getattr_50 = matmul_12.dtype\n getattr_51 = matmul_12.device\n full_12 = torch.full([], pow_13, dtype = getattr_50, device = getattr_51); pow_13 = getattr_50 = getattr_51 = None\n truediv_6 = matmul_12 / full_12; matmul_12 = full_12 = None\n size_99 = permute_24.size(-2); permute_24 = None\n size_100 = permute_25.size(-2)\n transformer_h_6_attn_bias = getattr(self.transformer.h, "6").attn.bias\n sub_7 = size_100 - size_99; size_99 = None\n getitem_84 = transformer_h_6_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_7, size_100, None), slice(None, size_100, None))]; transformer_h_6_attn_bias = sub_7 = size_100 = None\n getattr_52 = truediv_6.dtype\n finfo_6 = torch.finfo(getattr_52); getattr_52 = None\n getattr_53 = finfo_6.min; finfo_6 = None\n getattr_54 = truediv_6.dtype\n full_13 = torch.full([], getattr_53, dtype = getattr_54); getattr_53 = getattr_54 = None\n getattr_55 = truediv_6.device\n to_13 = full_13.to(getattr_55); full_13 = getattr_55 = None\n getattr_56 = truediv_6.dtype\n to_14 = truediv_6.to(getattr_56); truediv_6 = getattr_56 = None\n where_6 = torch.where(getitem_84, to_14, to_13); getitem_84 = to_14 = to_13 = None\n add_85 = where_6 + mul; where_6 = None\n softmax_6 = torch.nn.functional.softmax(add_85, dim = -1, _stacklevel = 3, dtype = None); add_85 = None\n getattr_57 = permute_26.dtype\n type_7 = softmax_6.type(getattr_57); softmax_6 = getattr_57 = None\n transformer_h_6_attn_attn_dropout = getattr(self.transformer.h, "6").attn.attn_dropout(type_7); type_7 = None\n matmul_13 = torch.matmul(transformer_h_6_attn_attn_dropout, permute_26); transformer_h_6_attn_attn_dropout = None\n permute_27 = matmul_13.permute(0, 2, 1, 3); matmul_13 = None\n contiguous_6 = permute_27.contiguous(); permute_27 = None\n size_101 = contiguous_6.size()\n getitem_85 = size_101[slice(None, -2, None)]; 
size_101 = None\n add_86 = getitem_85 + (768,); getitem_85 = None\n view_80 = contiguous_6.view(add_86); contiguous_6 = add_86 = None\n size_102 = view_80.size()\n getitem_86 = size_102[slice(None, -1, None)]; size_102 = None\n add_87 = getitem_86 + (768,); getitem_86 = None\n transformer_h_6_attn_c_proj_bias = getattr(self.transformer.h, "6").attn.c_proj.bias\n size_103 = view_80.size(-1)\n view_81 = view_80.view(-1, size_103); view_80 = size_103 = None\n transformer_h_6_attn_c_proj_weight = getattr(self.transformer.h, "6").attn.c_proj.weight\n addmm_25 = torch.addmm(transformer_h_6_attn_c_proj_bias, view_81, transformer_h_6_attn_c_proj_weight); transformer_h_6_attn_c_proj_bias = view_81 = transformer_h_6_attn_c_proj_weight = None\n view_82 = addmm_25.view(add_87); addmm_25 = add_87 = None\n transformer_h_6_attn_resid_dropout = getattr(self.transformer.h, "6").attn.resid_dropout(view_82); view_82 = None\n add_88 = transformer_h_6_attn_resid_dropout + add_80; transformer_h_6_attn_resid_dropout = add_80 = None\n transformer_h_6_ln_2 = getattr(self.transformer.h, "6").ln_2(add_88)\n size_104 = transformer_h_6_ln_2.size()\n getitem_87 = size_104[slice(None, -1, None)]; size_104 = None\n add_89 = getitem_87 + (3072,); getitem_87 = None\n transformer_h_6_mlp_c_fc_bias = getattr(self.transformer.h, "6").mlp.c_fc.bias\n size_105 = transformer_h_6_ln_2.size(-1)\n view_83 = transformer_h_6_ln_2.view(-1, size_105); transformer_h_6_ln_2 = size_105 = None\n transformer_h_6_mlp_c_fc_weight = getattr(self.transformer.h, "6").mlp.c_fc.weight\n addmm_26 = torch.addmm(transformer_h_6_mlp_c_fc_bias, view_83, transformer_h_6_mlp_c_fc_weight); transformer_h_6_mlp_c_fc_bias = view_83 = transformer_h_6_mlp_c_fc_weight = None\n view_84 = addmm_26.view(add_89); addmm_26 = add_89 = None\n mul_25 = 0.5 * view_84\n pow_14 = torch.pow(view_84, 3.0)\n mul_26 = 0.044715 * pow_14; pow_14 = None\n add_90 = view_84 + mul_26; view_84 = mul_26 = None\n mul_27 = 0.7978845608028654 * add_90; add_90 = 
None\n tanh_6 = torch.tanh(mul_27); mul_27 = None\n add_91 = 1.0 + tanh_6; tanh_6 = None\n mul_28 = mul_25 * add_91; mul_25 = add_91 = None\n size_106 = mul_28.size()\n getitem_88 = size_106[slice(None, -1, None)]; size_106 = None\n add_92 = getitem_88 + (768,); getitem_88 = None\n transformer_h_6_mlp_c_proj_bias = getattr(self.transformer.h, "6").mlp.c_proj.bias\n size_107 = mul_28.size(-1)\n view_85 = mul_28.view(-1, size_107); mul_28 = size_107 = None\n transformer_h_6_mlp_c_proj_weight = getattr(self.transformer.h, "6").mlp.c_proj.weight\n addmm_27 = torch.addmm(transformer_h_6_mlp_c_proj_bias, view_85, transformer_h_6_mlp_c_proj_weight); transformer_h_6_mlp_c_proj_bias = view_85 = transformer_h_6_mlp_c_proj_weight = None\n view_86 = addmm_27.view(add_92); addmm_27 = add_92 = None\n transformer_h_6_mlp_dropout = getattr(self.transformer.h, "6").mlp.dropout(view_86); view_86 = None\n add_93 = add_88 + transformer_h_6_mlp_dropout; add_88 = transformer_h_6_mlp_dropout = None\n transformer_h_7_ln_1 = getattr(self.transformer.h, "7").ln_1(add_93)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, add_93, transformer_h_7_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, add_93, transformer_h_7_ln_1):\n # No stacktrace found for following nodes\n size_108 = transformer_h_7_ln_1.size()\n getitem_89 = size_108[slice(None, -1, None)]; size_108 = None\n add_94 = getitem_89 + (2304,); getitem_89 = None\n transformer_h_7_attn_c_attn_bias = getattr(self.transformer.h, "7").attn.c_attn.bias\n size_109 = transformer_h_7_ln_1.size(-1)\n view_87 = transformer_h_7_ln_1.view(-1, size_109); transformer_h_7_ln_1 = size_109 = None\n 
transformer_h_7_attn_c_attn_weight = getattr(self.transformer.h, "7").attn.c_attn.weight\n addmm_28 = torch.addmm(transformer_h_7_attn_c_attn_bias, view_87, transformer_h_7_attn_c_attn_weight); transformer_h_7_attn_c_attn_bias = view_87 = transformer_h_7_attn_c_attn_weight = None\n view_88 = addmm_28.view(add_94); addmm_28 = add_94 = None\n split_7 = view_88.split(768, dim = 2); view_88 = None\n getitem_90 = split_7[0]\n getitem_91 = split_7[1]\n getitem_92 = split_7[2]; split_7 = None\n size_110 = getitem_90.size()\n getitem_93 = size_110[slice(None, -1, None)]; size_110 = None\n add_95 = getitem_93 + (12, 64); getitem_93 = None\n view_89 = getitem_90.view(add_95); getitem_90 = add_95 = None\n permute_28 = view_89.permute(0, 2, 1, 3); view_89 = None\n size_111 = getitem_91.size()\n getitem_94 = size_111[slice(None, -1, None)]; size_111 = None\n add_96 = getitem_94 + (12, 64); getitem_94 = None\n view_90 = getitem_91.view(add_96); getitem_91 = add_96 = None\n permute_29 = view_90.permute(0, 2, 1, 3); view_90 = None\n size_112 = getitem_92.size()\n getitem_95 = size_112[slice(None, -1, None)]; size_112 = None\n add_97 = getitem_95 + (12, 64); getitem_95 = None\n view_91 = getitem_92.view(add_97); getitem_92 = add_97 = None\n permute_30 = view_91.permute(0, 2, 1, 3); view_91 = None\n transpose_7 = permute_29.transpose(-1, -2)\n matmul_14 = torch.matmul(permute_28, transpose_7); transpose_7 = None\n size_113 = permute_30.size(-1)\n pow_15 = size_113 ** 0.5; size_113 = None\n getattr_58 = matmul_14.dtype\n getattr_59 = matmul_14.device\n full_14 = torch.full([], pow_15, dtype = getattr_58, device = getattr_59); pow_15 = getattr_58 = getattr_59 = None\n truediv_7 = matmul_14 / full_14; matmul_14 = full_14 = None\n size_114 = permute_28.size(-2); permute_28 = None\n size_115 = permute_29.size(-2)\n transformer_h_7_attn_bias = getattr(self.transformer.h, "7").attn.bias\n sub_8 = size_115 - size_114; size_114 = None\n getitem_96 = transformer_h_7_attn_bias[(slice(None, 
None, None), slice(None, None, None), slice(sub_8, size_115, None), slice(None, size_115, None))]; transformer_h_7_attn_bias = sub_8 = size_115 = None\n getattr_60 = truediv_7.dtype\n finfo_7 = torch.finfo(getattr_60); getattr_60 = None\n getattr_61 = finfo_7.min; finfo_7 = None\n getattr_62 = truediv_7.dtype\n full_15 = torch.full([], getattr_61, dtype = getattr_62); getattr_61 = getattr_62 = None\n getattr_63 = truediv_7.device\n to_15 = full_15.to(getattr_63); full_15 = getattr_63 = None\n getattr_64 = truediv_7.dtype\n to_16 = truediv_7.to(getattr_64); truediv_7 = getattr_64 = None\n where_7 = torch.where(getitem_96, to_16, to_15); getitem_96 = to_16 = to_15 = None\n add_98 = where_7 + mul; where_7 = None\n softmax_7 = torch.nn.functional.softmax(add_98, dim = -1, _stacklevel = 3, dtype = None); add_98 = None\n getattr_65 = permute_30.dtype\n type_8 = softmax_7.type(getattr_65); softmax_7 = getattr_65 = None\n transformer_h_7_attn_attn_dropout = getattr(self.transformer.h, "7").attn.attn_dropout(type_8); type_8 = None\n matmul_15 = torch.matmul(transformer_h_7_attn_attn_dropout, permute_30); transformer_h_7_attn_attn_dropout = None\n permute_31 = matmul_15.permute(0, 2, 1, 3); matmul_15 = None\n contiguous_7 = permute_31.contiguous(); permute_31 = None\n size_116 = contiguous_7.size()\n getitem_97 = size_116[slice(None, -2, None)]; size_116 = None\n add_99 = getitem_97 + (768,); getitem_97 = None\n view_92 = contiguous_7.view(add_99); contiguous_7 = add_99 = None\n size_117 = view_92.size()\n getitem_98 = size_117[slice(None, -1, None)]; size_117 = None\n add_100 = getitem_98 + (768,); getitem_98 = None\n transformer_h_7_attn_c_proj_bias = getattr(self.transformer.h, "7").attn.c_proj.bias\n size_118 = view_92.size(-1)\n view_93 = view_92.view(-1, size_118); view_92 = size_118 = None\n transformer_h_7_attn_c_proj_weight = getattr(self.transformer.h, "7").attn.c_proj.weight\n addmm_29 = torch.addmm(transformer_h_7_attn_c_proj_bias, view_93, 
transformer_h_7_attn_c_proj_weight); transformer_h_7_attn_c_proj_bias = view_93 = transformer_h_7_attn_c_proj_weight = None\n view_94 = addmm_29.view(add_100); addmm_29 = add_100 = None\n transformer_h_7_attn_resid_dropout = getattr(self.transformer.h, "7").attn.resid_dropout(view_94); view_94 = None\n add_101 = transformer_h_7_attn_resid_dropout + add_93; transformer_h_7_attn_resid_dropout = add_93 = None\n transformer_h_7_ln_2 = getattr(self.transformer.h, "7").ln_2(add_101)\n size_119 = transformer_h_7_ln_2.size()\n getitem_99 = size_119[slice(None, -1, None)]; size_119 = None\n add_102 = getitem_99 + (3072,); getitem_99 = None\n transformer_h_7_mlp_c_fc_bias = getattr(self.transformer.h, "7").mlp.c_fc.bias\n size_120 = transformer_h_7_ln_2.size(-1)\n view_95 = transformer_h_7_ln_2.view(-1, size_120); transformer_h_7_ln_2 = size_120 = None\n transformer_h_7_mlp_c_fc_weight = getattr(self.transformer.h, "7").mlp.c_fc.weight\n addmm_30 = torch.addmm(transformer_h_7_mlp_c_fc_bias, view_95, transformer_h_7_mlp_c_fc_weight); transformer_h_7_mlp_c_fc_bias = view_95 = transformer_h_7_mlp_c_fc_weight = None\n view_96 = addmm_30.view(add_102); addmm_30 = add_102 = None\n mul_29 = 0.5 * view_96\n pow_16 = torch.pow(view_96, 3.0)\n mul_30 = 0.044715 * pow_16; pow_16 = None\n add_103 = view_96 + mul_30; view_96 = mul_30 = None\n mul_31 = 0.7978845608028654 * add_103; add_103 = None\n tanh_7 = torch.tanh(mul_31); mul_31 = None\n add_104 = 1.0 + tanh_7; tanh_7 = None\n mul_32 = mul_29 * add_104; mul_29 = add_104 = None\n size_121 = mul_32.size()\n getitem_100 = size_121[slice(None, -1, None)]; size_121 = None\n add_105 = getitem_100 + (768,); getitem_100 = None\n transformer_h_7_mlp_c_proj_bias = getattr(self.transformer.h, "7").mlp.c_proj.bias\n size_122 = mul_32.size(-1)\n view_97 = mul_32.view(-1, size_122); mul_32 = size_122 = None\n transformer_h_7_mlp_c_proj_weight = getattr(self.transformer.h, "7").mlp.c_proj.weight\n addmm_31 = 
torch.addmm(transformer_h_7_mlp_c_proj_bias, view_97, transformer_h_7_mlp_c_proj_weight); transformer_h_7_mlp_c_proj_bias = view_97 = transformer_h_7_mlp_c_proj_weight = None\n view_98 = addmm_31.view(add_105); addmm_31 = add_105 = None\n transformer_h_7_mlp_dropout = getattr(self.transformer.h, "7").mlp.dropout(view_98); view_98 = None\n add_106 = add_101 + transformer_h_7_mlp_dropout; add_101 = transformer_h_7_mlp_dropout = None\n transformer_h_8_ln_1 = getattr(self.transformer.h, "8").ln_1(add_106)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, add_106, transformer_h_8_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, add_106, transformer_h_8_ln_1):\n # No stacktrace found for following nodes\n size_123 = transformer_h_8_ln_1.size()\n getitem_101 = size_123[slice(None, -1, None)]; size_123 = None\n add_107 = getitem_101 + (2304,); getitem_101 = None\n transformer_h_8_attn_c_attn_bias = getattr(self.transformer.h, "8").attn.c_attn.bias\n size_124 = transformer_h_8_ln_1.size(-1)\n view_99 = transformer_h_8_ln_1.view(-1, size_124); transformer_h_8_ln_1 = size_124 = None\n transformer_h_8_attn_c_attn_weight = getattr(self.transformer.h, "8").attn.c_attn.weight\n addmm_32 = torch.addmm(transformer_h_8_attn_c_attn_bias, view_99, transformer_h_8_attn_c_attn_weight); transformer_h_8_attn_c_attn_bias = view_99 = transformer_h_8_attn_c_attn_weight = None\n view_100 = addmm_32.view(add_107); addmm_32 = add_107 = None\n split_8 = view_100.split(768, dim = 2); view_100 = None\n getitem_102 = split_8[0]\n getitem_103 = split_8[1]\n getitem_104 = split_8[2]; split_8 = None\n size_125 = 
getitem_102.size()\n getitem_105 = size_125[slice(None, -1, None)]; size_125 = None\n add_108 = getitem_105 + (12, 64); getitem_105 = None\n view_101 = getitem_102.view(add_108); getitem_102 = add_108 = None\n permute_32 = view_101.permute(0, 2, 1, 3); view_101 = None\n size_126 = getitem_103.size()\n getitem_106 = size_126[slice(None, -1, None)]; size_126 = None\n add_109 = getitem_106 + (12, 64); getitem_106 = None\n view_102 = getitem_103.view(add_109); getitem_103 = add_109 = None\n permute_33 = view_102.permute(0, 2, 1, 3); view_102 = None\n size_127 = getitem_104.size()\n getitem_107 = size_127[slice(None, -1, None)]; size_127 = None\n add_110 = getitem_107 + (12, 64); getitem_107 = None\n view_103 = getitem_104.view(add_110); getitem_104 = add_110 = None\n permute_34 = view_103.permute(0, 2, 1, 3); view_103 = None\n transpose_8 = permute_33.transpose(-1, -2)\n matmul_16 = torch.matmul(permute_32, transpose_8); transpose_8 = None\n size_128 = permute_34.size(-1)\n pow_17 = size_128 ** 0.5; size_128 = None\n getattr_66 = matmul_16.dtype\n getattr_67 = matmul_16.device\n full_16 = torch.full([], pow_17, dtype = getattr_66, device = getattr_67); pow_17 = getattr_66 = getattr_67 = None\n truediv_8 = matmul_16 / full_16; matmul_16 = full_16 = None\n size_129 = permute_32.size(-2); permute_32 = None\n size_130 = permute_33.size(-2)\n transformer_h_8_attn_bias = getattr(self.transformer.h, "8").attn.bias\n sub_9 = size_130 - size_129; size_129 = None\n getitem_108 = transformer_h_8_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_9, size_130, None), slice(None, size_130, None))]; transformer_h_8_attn_bias = sub_9 = size_130 = None\n getattr_68 = truediv_8.dtype\n finfo_8 = torch.finfo(getattr_68); getattr_68 = None\n getattr_69 = finfo_8.min; finfo_8 = None\n getattr_70 = truediv_8.dtype\n full_17 = torch.full([], getattr_69, dtype = getattr_70); getattr_69 = getattr_70 = None\n getattr_71 = truediv_8.device\n to_17 = full_17.to(getattr_71); 
full_17 = getattr_71 = None\n getattr_72 = truediv_8.dtype\n to_18 = truediv_8.to(getattr_72); truediv_8 = getattr_72 = None\n where_8 = torch.where(getitem_108, to_18, to_17); getitem_108 = to_18 = to_17 = None\n add_111 = where_8 + mul; where_8 = None\n softmax_8 = torch.nn.functional.softmax(add_111, dim = -1, _stacklevel = 3, dtype = None); add_111 = None\n getattr_73 = permute_34.dtype\n type_9 = softmax_8.type(getattr_73); softmax_8 = getattr_73 = None\n transformer_h_8_attn_attn_dropout = getattr(self.transformer.h, "8").attn.attn_dropout(type_9); type_9 = None\n matmul_17 = torch.matmul(transformer_h_8_attn_attn_dropout, permute_34); transformer_h_8_attn_attn_dropout = None\n permute_35 = matmul_17.permute(0, 2, 1, 3); matmul_17 = None\n contiguous_8 = permute_35.contiguous(); permute_35 = None\n size_131 = contiguous_8.size()\n getitem_109 = size_131[slice(None, -2, None)]; size_131 = None\n add_112 = getitem_109 + (768,); getitem_109 = None\n view_104 = contiguous_8.view(add_112); contiguous_8 = add_112 = None\n size_132 = view_104.size()\n getitem_110 = size_132[slice(None, -1, None)]; size_132 = None\n add_113 = getitem_110 + (768,); getitem_110 = None\n transformer_h_8_attn_c_proj_bias = getattr(self.transformer.h, "8").attn.c_proj.bias\n size_133 = view_104.size(-1)\n view_105 = view_104.view(-1, size_133); view_104 = size_133 = None\n transformer_h_8_attn_c_proj_weight = getattr(self.transformer.h, "8").attn.c_proj.weight\n addmm_33 = torch.addmm(transformer_h_8_attn_c_proj_bias, view_105, transformer_h_8_attn_c_proj_weight); transformer_h_8_attn_c_proj_bias = view_105 = transformer_h_8_attn_c_proj_weight = None\n view_106 = addmm_33.view(add_113); addmm_33 = add_113 = None\n transformer_h_8_attn_resid_dropout = getattr(self.transformer.h, "8").attn.resid_dropout(view_106); view_106 = None\n add_114 = transformer_h_8_attn_resid_dropout + add_106; transformer_h_8_attn_resid_dropout = add_106 = None\n transformer_h_8_ln_2 = getattr(self.transformer.h, 
"8").ln_2(add_114)\n size_134 = transformer_h_8_ln_2.size()\n getitem_111 = size_134[slice(None, -1, None)]; size_134 = None\n add_115 = getitem_111 + (3072,); getitem_111 = None\n transformer_h_8_mlp_c_fc_bias = getattr(self.transformer.h, "8").mlp.c_fc.bias\n size_135 = transformer_h_8_ln_2.size(-1)\n view_107 = transformer_h_8_ln_2.view(-1, size_135); transformer_h_8_ln_2 = size_135 = None\n transformer_h_8_mlp_c_fc_weight = getattr(self.transformer.h, "8").mlp.c_fc.weight\n addmm_34 = torch.addmm(transformer_h_8_mlp_c_fc_bias, view_107, transformer_h_8_mlp_c_fc_weight); transformer_h_8_mlp_c_fc_bias = view_107 = transformer_h_8_mlp_c_fc_weight = None\n view_108 = addmm_34.view(add_115); addmm_34 = add_115 = None\n mul_33 = 0.5 * view_108\n pow_18 = torch.pow(view_108, 3.0)\n mul_34 = 0.044715 * pow_18; pow_18 = None\n add_116 = view_108 + mul_34; view_108 = mul_34 = None\n mul_35 = 0.7978845608028654 * add_116; add_116 = None\n tanh_8 = torch.tanh(mul_35); mul_35 = None\n add_117 = 1.0 + tanh_8; tanh_8 = None\n mul_36 = mul_33 * add_117; mul_33 = add_117 = None\n size_136 = mul_36.size()\n getitem_112 = size_136[slice(None, -1, None)]; size_136 = None\n add_118 = getitem_112 + (768,); getitem_112 = None\n transformer_h_8_mlp_c_proj_bias = getattr(self.transformer.h, "8").mlp.c_proj.bias\n size_137 = mul_36.size(-1)\n view_109 = mul_36.view(-1, size_137); mul_36 = size_137 = None\n transformer_h_8_mlp_c_proj_weight = getattr(self.transformer.h, "8").mlp.c_proj.weight\n addmm_35 = torch.addmm(transformer_h_8_mlp_c_proj_bias, view_109, transformer_h_8_mlp_c_proj_weight); transformer_h_8_mlp_c_proj_bias = view_109 = transformer_h_8_mlp_c_proj_weight = None\n view_110 = addmm_35.view(add_118); addmm_35 = add_118 = None\n transformer_h_8_mlp_dropout = getattr(self.transformer.h, "8").mlp.dropout(view_110); view_110 = None\n add_119 = add_114 + transformer_h_8_mlp_dropout; add_114 = transformer_h_8_mlp_dropout = None\n transformer_h_9_ln_1 = 
getattr(self.transformer.h, "9").ln_1(add_119)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, add_119, transformer_h_9_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, add_119, transformer_h_9_ln_1):\n # No stacktrace found for following nodes\n size_138 = transformer_h_9_ln_1.size()\n getitem_113 = size_138[slice(None, -1, None)]; size_138 = None\n add_120 = getitem_113 + (2304,); getitem_113 = None\n transformer_h_9_attn_c_attn_bias = getattr(self.transformer.h, "9").attn.c_attn.bias\n size_139 = transformer_h_9_ln_1.size(-1)\n view_111 = transformer_h_9_ln_1.view(-1, size_139); transformer_h_9_ln_1 = size_139 = None\n transformer_h_9_attn_c_attn_weight = getattr(self.transformer.h, "9").attn.c_attn.weight\n addmm_36 = torch.addmm(transformer_h_9_attn_c_attn_bias, view_111, transformer_h_9_attn_c_attn_weight); transformer_h_9_attn_c_attn_bias = view_111 = transformer_h_9_attn_c_attn_weight = None\n view_112 = addmm_36.view(add_120); addmm_36 = add_120 = None\n split_9 = view_112.split(768, dim = 2); view_112 = None\n getitem_114 = split_9[0]\n getitem_115 = split_9[1]\n getitem_116 = split_9[2]; split_9 = None\n size_140 = getitem_114.size()\n getitem_117 = size_140[slice(None, -1, None)]; size_140 = None\n add_121 = getitem_117 + (12, 64); getitem_117 = None\n view_113 = getitem_114.view(add_121); getitem_114 = add_121 = None\n permute_36 = view_113.permute(0, 2, 1, 3); view_113 = None\n size_141 = getitem_115.size()\n getitem_118 = size_141[slice(None, -1, None)]; size_141 = None\n add_122 = getitem_118 + (12, 64); 
getitem_118 = None\n view_114 = getitem_115.view(add_122); getitem_115 = add_122 = None\n permute_37 = view_114.permute(0, 2, 1, 3); view_114 = None\n size_142 = getitem_116.size()\n getitem_119 = size_142[slice(None, -1, None)]; size_142 = None\n add_123 = getitem_119 + (12, 64); getitem_119 = None\n view_115 = getitem_116.view(add_123); getitem_116 = add_123 = None\n permute_38 = view_115.permute(0, 2, 1, 3); view_115 = None\n transpose_9 = permute_37.transpose(-1, -2)\n matmul_18 = torch.matmul(permute_36, transpose_9); transpose_9 = None\n size_143 = permute_38.size(-1)\n pow_19 = size_143 ** 0.5; size_143 = None\n getattr_74 = matmul_18.dtype\n getattr_75 = matmul_18.device\n full_18 = torch.full([], pow_19, dtype = getattr_74, device = getattr_75); pow_19 = getattr_74 = getattr_75 = None\n truediv_9 = matmul_18 / full_18; matmul_18 = full_18 = None\n size_144 = permute_36.size(-2); permute_36 = None\n size_145 = permute_37.size(-2)\n transformer_h_9_attn_bias = getattr(self.transformer.h, "9").attn.bias\n sub_10 = size_145 - size_144; size_144 = None\n getitem_120 = transformer_h_9_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_10, size_145, None), slice(None, size_145, None))]; transformer_h_9_attn_bias = sub_10 = size_145 = None\n getattr_76 = truediv_9.dtype\n finfo_9 = torch.finfo(getattr_76); getattr_76 = None\n getattr_77 = finfo_9.min; finfo_9 = None\n getattr_78 = truediv_9.dtype\n full_19 = torch.full([], getattr_77, dtype = getattr_78); getattr_77 = getattr_78 = None\n getattr_79 = truediv_9.device\n to_19 = full_19.to(getattr_79); full_19 = getattr_79 = None\n getattr_80 = truediv_9.dtype\n to_20 = truediv_9.to(getattr_80); truediv_9 = getattr_80 = None\n where_9 = torch.where(getitem_120, to_20, to_19); getitem_120 = to_20 = to_19 = None\n add_124 = where_9 + mul; where_9 = None\n softmax_9 = torch.nn.functional.softmax(add_124, dim = -1, _stacklevel = 3, dtype = None); add_124 = None\n getattr_81 = permute_38.dtype\n 
type_10 = softmax_9.type(getattr_81); softmax_9 = getattr_81 = None\n transformer_h_9_attn_attn_dropout = getattr(self.transformer.h, "9").attn.attn_dropout(type_10); type_10 = None\n matmul_19 = torch.matmul(transformer_h_9_attn_attn_dropout, permute_38); transformer_h_9_attn_attn_dropout = None\n permute_39 = matmul_19.permute(0, 2, 1, 3); matmul_19 = None\n contiguous_9 = permute_39.contiguous(); permute_39 = None\n size_146 = contiguous_9.size()\n getitem_121 = size_146[slice(None, -2, None)]; size_146 = None\n add_125 = getitem_121 + (768,); getitem_121 = None\n view_116 = contiguous_9.view(add_125); contiguous_9 = add_125 = None\n size_147 = view_116.size()\n getitem_122 = size_147[slice(None, -1, None)]; size_147 = None\n add_126 = getitem_122 + (768,); getitem_122 = None\n transformer_h_9_attn_c_proj_bias = getattr(self.transformer.h, "9").attn.c_proj.bias\n size_148 = view_116.size(-1)\n view_117 = view_116.view(-1, size_148); view_116 = size_148 = None\n transformer_h_9_attn_c_proj_weight = getattr(self.transformer.h, "9").attn.c_proj.weight\n addmm_37 = torch.addmm(transformer_h_9_attn_c_proj_bias, view_117, transformer_h_9_attn_c_proj_weight); transformer_h_9_attn_c_proj_bias = view_117 = transformer_h_9_attn_c_proj_weight = None\n view_118 = addmm_37.view(add_126); addmm_37 = add_126 = None\n transformer_h_9_attn_resid_dropout = getattr(self.transformer.h, "9").attn.resid_dropout(view_118); view_118 = None\n add_127 = transformer_h_9_attn_resid_dropout + add_119; transformer_h_9_attn_resid_dropout = add_119 = None\n transformer_h_9_ln_2 = getattr(self.transformer.h, "9").ln_2(add_127)\n size_149 = transformer_h_9_ln_2.size()\n getitem_123 = size_149[slice(None, -1, None)]; size_149 = None\n add_128 = getitem_123 + (3072,); getitem_123 = None\n transformer_h_9_mlp_c_fc_bias = getattr(self.transformer.h, "9").mlp.c_fc.bias\n size_150 = transformer_h_9_ln_2.size(-1)\n view_119 = transformer_h_9_ln_2.view(-1, size_150); transformer_h_9_ln_2 = size_150 = 
None\n transformer_h_9_mlp_c_fc_weight = getattr(self.transformer.h, "9").mlp.c_fc.weight\n addmm_38 = torch.addmm(transformer_h_9_mlp_c_fc_bias, view_119, transformer_h_9_mlp_c_fc_weight); transformer_h_9_mlp_c_fc_bias = view_119 = transformer_h_9_mlp_c_fc_weight = None\n view_120 = addmm_38.view(add_128); addmm_38 = add_128 = None\n mul_37 = 0.5 * view_120\n pow_20 = torch.pow(view_120, 3.0)\n mul_38 = 0.044715 * pow_20; pow_20 = None\n add_129 = view_120 + mul_38; view_120 = mul_38 = None\n mul_39 = 0.7978845608028654 * add_129; add_129 = None\n tanh_9 = torch.tanh(mul_39); mul_39 = None\n add_130 = 1.0 + tanh_9; tanh_9 = None\n mul_40 = mul_37 * add_130; mul_37 = add_130 = None\n size_151 = mul_40.size()\n getitem_124 = size_151[slice(None, -1, None)]; size_151 = None\n add_131 = getitem_124 + (768,); getitem_124 = None\n transformer_h_9_mlp_c_proj_bias = getattr(self.transformer.h, "9").mlp.c_proj.bias\n size_152 = mul_40.size(-1)\n view_121 = mul_40.view(-1, size_152); mul_40 = size_152 = None\n transformer_h_9_mlp_c_proj_weight = getattr(self.transformer.h, "9").mlp.c_proj.weight\n addmm_39 = torch.addmm(transformer_h_9_mlp_c_proj_bias, view_121, transformer_h_9_mlp_c_proj_weight); transformer_h_9_mlp_c_proj_bias = view_121 = transformer_h_9_mlp_c_proj_weight = None\n view_122 = addmm_39.view(add_131); addmm_39 = add_131 = None\n transformer_h_9_mlp_dropout = getattr(self.transformer.h, "9").mlp.dropout(view_122); view_122 = None\n add_132 = add_127 + transformer_h_9_mlp_dropout; add_127 = transformer_h_9_mlp_dropout = None\n transformer_h_10_ln_1 = getattr(self.transformer.h, "10").ln_1(add_132)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, add_132, transformer_h_10_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, 
mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, add_132, transformer_h_10_ln_1):\n # No stacktrace found for following nodes\n size_153 = transformer_h_10_ln_1.size()\n getitem_125 = size_153[slice(None, -1, None)]; size_153 = None\n add_133 = getitem_125 + (2304,); getitem_125 = None\n transformer_h_10_attn_c_attn_bias = getattr(self.transformer.h, "10").attn.c_attn.bias\n size_154 = transformer_h_10_ln_1.size(-1)\n view_123 = transformer_h_10_ln_1.view(-1, size_154); transformer_h_10_ln_1 = size_154 = None\n transformer_h_10_attn_c_attn_weight = getattr(self.transformer.h, "10").attn.c_attn.weight\n addmm_40 = torch.addmm(transformer_h_10_attn_c_attn_bias, view_123, transformer_h_10_attn_c_attn_weight); transformer_h_10_attn_c_attn_bias = view_123 = transformer_h_10_attn_c_attn_weight = None\n view_124 = addmm_40.view(add_133); addmm_40 = add_133 = None\n split_10 = view_124.split(768, dim = 2); view_124 = None\n getitem_126 = split_10[0]\n getitem_127 = split_10[1]\n getitem_128 = split_10[2]; split_10 = None\n size_155 = getitem_126.size()\n getitem_129 = size_155[slice(None, -1, None)]; size_155 = None\n add_134 = getitem_129 + (12, 64); getitem_129 = None\n view_125 = getitem_126.view(add_134); getitem_126 = add_134 = None\n permute_40 = view_125.permute(0, 2, 1, 3); view_125 = None\n size_156 = getitem_127.size()\n getitem_130 = size_156[slice(None, -1, None)]; size_156 = None\n add_135 = getitem_130 + (12, 64); getitem_130 = None\n view_126 = getitem_127.view(add_135); getitem_127 = add_135 = None\n permute_41 = view_126.permute(0, 2, 1, 3); view_126 = None\n size_157 = getitem_128.size()\n getitem_131 = size_157[slice(None, -1, None)]; size_157 = None\n add_136 = getitem_131 + (12, 64); getitem_131 = None\n view_127 = getitem_128.view(add_136); getitem_128 = 
add_136 = None\n permute_42 = view_127.permute(0, 2, 1, 3); view_127 = None\n transpose_10 = permute_41.transpose(-1, -2)\n matmul_20 = torch.matmul(permute_40, transpose_10); transpose_10 = None\n size_158 = permute_42.size(-1)\n pow_21 = size_158 ** 0.5; size_158 = None\n getattr_82 = matmul_20.dtype\n getattr_83 = matmul_20.device\n full_20 = torch.full([], pow_21, dtype = getattr_82, device = getattr_83); pow_21 = getattr_82 = getattr_83 = None\n truediv_10 = matmul_20 / full_20; matmul_20 = full_20 = None\n size_159 = permute_40.size(-2); permute_40 = None\n size_160 = permute_41.size(-2)\n transformer_h_10_attn_bias = getattr(self.transformer.h, "10").attn.bias\n sub_11 = size_160 - size_159; size_159 = None\n getitem_132 = transformer_h_10_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_11, size_160, None), slice(None, size_160, None))]; transformer_h_10_attn_bias = sub_11 = size_160 = None\n getattr_84 = truediv_10.dtype\n finfo_10 = torch.finfo(getattr_84); getattr_84 = None\n getattr_85 = finfo_10.min; finfo_10 = None\n getattr_86 = truediv_10.dtype\n full_21 = torch.full([], getattr_85, dtype = getattr_86); getattr_85 = getattr_86 = None\n getattr_87 = truediv_10.device\n to_21 = full_21.to(getattr_87); full_21 = getattr_87 = None\n getattr_88 = truediv_10.dtype\n to_22 = truediv_10.to(getattr_88); truediv_10 = getattr_88 = None\n where_10 = torch.where(getitem_132, to_22, to_21); getitem_132 = to_22 = to_21 = None\n add_137 = where_10 + mul; where_10 = None\n softmax_10 = torch.nn.functional.softmax(add_137, dim = -1, _stacklevel = 3, dtype = None); add_137 = None\n getattr_89 = permute_42.dtype\n type_11 = softmax_10.type(getattr_89); softmax_10 = getattr_89 = None\n transformer_h_10_attn_attn_dropout = getattr(self.transformer.h, "10").attn.attn_dropout(type_11); type_11 = None\n matmul_21 = torch.matmul(transformer_h_10_attn_attn_dropout, permute_42); transformer_h_10_attn_attn_dropout = None\n permute_43 = matmul_21.permute(0, 
2, 1, 3); matmul_21 = None\n contiguous_10 = permute_43.contiguous(); permute_43 = None\n size_161 = contiguous_10.size()\n getitem_133 = size_161[slice(None, -2, None)]; size_161 = None\n add_138 = getitem_133 + (768,); getitem_133 = None\n view_128 = contiguous_10.view(add_138); contiguous_10 = add_138 = None\n size_162 = view_128.size()\n getitem_134 = size_162[slice(None, -1, None)]; size_162 = None\n add_139 = getitem_134 + (768,); getitem_134 = None\n transformer_h_10_attn_c_proj_bias = getattr(self.transformer.h, "10").attn.c_proj.bias\n size_163 = view_128.size(-1)\n view_129 = view_128.view(-1, size_163); view_128 = size_163 = None\n transformer_h_10_attn_c_proj_weight = getattr(self.transformer.h, "10").attn.c_proj.weight\n addmm_41 = torch.addmm(transformer_h_10_attn_c_proj_bias, view_129, transformer_h_10_attn_c_proj_weight); transformer_h_10_attn_c_proj_bias = view_129 = transformer_h_10_attn_c_proj_weight = None\n view_130 = addmm_41.view(add_139); addmm_41 = add_139 = None\n transformer_h_10_attn_resid_dropout = getattr(self.transformer.h, "10").attn.resid_dropout(view_130); view_130 = None\n add_140 = transformer_h_10_attn_resid_dropout + add_132; transformer_h_10_attn_resid_dropout = add_132 = None\n transformer_h_10_ln_2 = getattr(self.transformer.h, "10").ln_2(add_140)\n size_164 = transformer_h_10_ln_2.size()\n getitem_135 = size_164[slice(None, -1, None)]; size_164 = None\n add_141 = getitem_135 + (3072,); getitem_135 = None\n transformer_h_10_mlp_c_fc_bias = getattr(self.transformer.h, "10").mlp.c_fc.bias\n size_165 = transformer_h_10_ln_2.size(-1)\n view_131 = transformer_h_10_ln_2.view(-1, size_165); transformer_h_10_ln_2 = size_165 = None\n transformer_h_10_mlp_c_fc_weight = getattr(self.transformer.h, "10").mlp.c_fc.weight\n addmm_42 = torch.addmm(transformer_h_10_mlp_c_fc_bias, view_131, transformer_h_10_mlp_c_fc_weight); transformer_h_10_mlp_c_fc_bias = view_131 = transformer_h_10_mlp_c_fc_weight = None\n view_132 = 
addmm_42.view(add_141); addmm_42 = add_141 = None\n mul_41 = 0.5 * view_132\n pow_22 = torch.pow(view_132, 3.0)\n mul_42 = 0.044715 * pow_22; pow_22 = None\n add_142 = view_132 + mul_42; view_132 = mul_42 = None\n mul_43 = 0.7978845608028654 * add_142; add_142 = None\n tanh_10 = torch.tanh(mul_43); mul_43 = None\n add_143 = 1.0 + tanh_10; tanh_10 = None\n mul_44 = mul_41 * add_143; mul_41 = add_143 = None\n size_166 = mul_44.size()\n getitem_136 = size_166[slice(None, -1, None)]; size_166 = None\n add_144 = getitem_136 + (768,); getitem_136 = None\n transformer_h_10_mlp_c_proj_bias = getattr(self.transformer.h, "10").mlp.c_proj.bias\n size_167 = mul_44.size(-1)\n view_133 = mul_44.view(-1, size_167); mul_44 = size_167 = None\n transformer_h_10_mlp_c_proj_weight = getattr(self.transformer.h, "10").mlp.c_proj.weight\n addmm_43 = torch.addmm(transformer_h_10_mlp_c_proj_bias, view_133, transformer_h_10_mlp_c_proj_weight); transformer_h_10_mlp_c_proj_bias = view_133 = transformer_h_10_mlp_c_proj_weight = None\n view_134 = addmm_43.view(add_144); addmm_43 = add_144 = None\n transformer_h_10_mlp_dropout = getattr(self.transformer.h, "10").mlp.dropout(view_134); view_134 = None\n add_145 = add_140 + transformer_h_10_mlp_dropout; add_140 = transformer_h_10_mlp_dropout = None\n transformer_h_11_ln_1 = getattr(self.transformer.h, "11").ln_1(add_145)\n return (labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, permute_41, permute_42, add_145, transformer_h_11_ln_1)\n ', 'class GraphModule(torch.nn.Module):\n def forward(self, labels, mul, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, 
permute_41, permute_42, add_145, transformer_h_11_ln_1):\n # No stacktrace found for following nodes\n size_168 = transformer_h_11_ln_1.size()\n getitem_137 = size_168[slice(None, -1, None)]; size_168 = None\n add_146 = getitem_137 + (2304,); getitem_137 = None\n transformer_h_11_attn_c_attn_bias = getattr(self.transformer.h, "11").attn.c_attn.bias\n size_169 = transformer_h_11_ln_1.size(-1)\n view_135 = transformer_h_11_ln_1.view(-1, size_169); transformer_h_11_ln_1 = size_169 = None\n transformer_h_11_attn_c_attn_weight = getattr(self.transformer.h, "11").attn.c_attn.weight\n addmm_44 = torch.addmm(transformer_h_11_attn_c_attn_bias, view_135, transformer_h_11_attn_c_attn_weight); transformer_h_11_attn_c_attn_bias = view_135 = transformer_h_11_attn_c_attn_weight = None\n view_136 = addmm_44.view(add_146); addmm_44 = add_146 = None\n split_11 = view_136.split(768, dim = 2); view_136 = None\n getitem_138 = split_11[0]\n getitem_139 = split_11[1]\n getitem_140 = split_11[2]; split_11 = None\n size_170 = getitem_138.size()\n getitem_141 = size_170[slice(None, -1, None)]; size_170 = None\n add_147 = getitem_141 + (12, 64); getitem_141 = None\n view_137 = getitem_138.view(add_147); getitem_138 = add_147 = None\n permute_44 = view_137.permute(0, 2, 1, 3); view_137 = None\n size_171 = getitem_139.size()\n getitem_142 = size_171[slice(None, -1, None)]; size_171 = None\n add_148 = getitem_142 + (12, 64); getitem_142 = None\n view_138 = getitem_139.view(add_148); getitem_139 = add_148 = None\n permute_45 = view_138.permute(0, 2, 1, 3); view_138 = None\n size_172 = getitem_140.size()\n getitem_143 = size_172[slice(None, -1, None)]; size_172 = None\n add_149 = getitem_143 + (12, 64); getitem_143 = None\n view_139 = getitem_140.view(add_149); getitem_140 = add_149 = None\n permute_46 = view_139.permute(0, 2, 1, 3); view_139 = None\n transpose_11 = permute_45.transpose(-1, -2)\n matmul_22 = torch.matmul(permute_44, transpose_11); transpose_11 = None\n size_173 = 
permute_46.size(-1)\n pow_23 = size_173 ** 0.5; size_173 = None\n getattr_90 = matmul_22.dtype\n getattr_91 = matmul_22.device\n full_22 = torch.full([], pow_23, dtype = getattr_90, device = getattr_91); pow_23 = getattr_90 = getattr_91 = None\n truediv_11 = matmul_22 / full_22; matmul_22 = full_22 = None\n size_174 = permute_44.size(-2); permute_44 = None\n size_175 = permute_45.size(-2)\n transformer_h_11_attn_bias = getattr(self.transformer.h, "11").attn.bias\n sub_12 = size_175 - size_174; size_174 = None\n getitem_144 = transformer_h_11_attn_bias[(slice(None, None, None), slice(None, None, None), slice(sub_12, size_175, None), slice(None, size_175, None))]; transformer_h_11_attn_bias = sub_12 = size_175 = None\n getattr_92 = truediv_11.dtype\n finfo_11 = torch.finfo(getattr_92); getattr_92 = None\n getattr_93 = finfo_11.min; finfo_11 = None\n getattr_94 = truediv_11.dtype\n full_23 = torch.full([], getattr_93, dtype = getattr_94); getattr_93 = getattr_94 = None\n getattr_95 = truediv_11.device\n to_23 = full_23.to(getattr_95); full_23 = getattr_95 = None\n getattr_96 = truediv_11.dtype\n to_24 = truediv_11.to(getattr_96); truediv_11 = getattr_96 = None\n where_11 = torch.where(getitem_144, to_24, to_23); getitem_144 = to_24 = to_23 = None\n add_150 = where_11 + mul; where_11 = mul = None\n softmax_11 = torch.nn.functional.softmax(add_150, dim = -1, _stacklevel = 3, dtype = None); add_150 = None\n getattr_97 = permute_46.dtype\n type_12 = softmax_11.type(getattr_97); softmax_11 = getattr_97 = None\n transformer_h_11_attn_attn_dropout = getattr(self.transformer.h, "11").attn.attn_dropout(type_12); type_12 = None\n matmul_23 = torch.matmul(transformer_h_11_attn_attn_dropout, permute_46); transformer_h_11_attn_attn_dropout = None\n permute_47 = matmul_23.permute(0, 2, 1, 3); matmul_23 = None\n contiguous_11 = permute_47.contiguous(); permute_47 = None\n size_176 = contiguous_11.size()\n getitem_145 = size_176[slice(None, -2, None)]; size_176 = None\n add_151 = 
getitem_145 + (768,); getitem_145 = None\n view_140 = contiguous_11.view(add_151); contiguous_11 = add_151 = None\n size_177 = view_140.size()\n getitem_146 = size_177[slice(None, -1, None)]; size_177 = None\n add_152 = getitem_146 + (768,); getitem_146 = None\n transformer_h_11_attn_c_proj_bias = getattr(self.transformer.h, "11").attn.c_proj.bias\n size_178 = view_140.size(-1)\n view_141 = view_140.view(-1, size_178); view_140 = size_178 = None\n transformer_h_11_attn_c_proj_weight = getattr(self.transformer.h, "11").attn.c_proj.weight\n addmm_45 = torch.addmm(transformer_h_11_attn_c_proj_bias, view_141, transformer_h_11_attn_c_proj_weight); transformer_h_11_attn_c_proj_bias = view_141 = transformer_h_11_attn_c_proj_weight = None\n view_142 = addmm_45.view(add_152); addmm_45 = add_152 = None\n transformer_h_11_attn_resid_dropout = getattr(self.transformer.h, "11").attn.resid_dropout(view_142); view_142 = None\n add_153 = transformer_h_11_attn_resid_dropout + add_145; transformer_h_11_attn_resid_dropout = add_145 = None\n transformer_h_11_ln_2 = getattr(self.transformer.h, "11").ln_2(add_153)\n size_179 = transformer_h_11_ln_2.size()\n getitem_147 = size_179[slice(None, -1, None)]; size_179 = None\n add_154 = getitem_147 + (3072,); getitem_147 = None\n transformer_h_11_mlp_c_fc_bias = getattr(self.transformer.h, "11").mlp.c_fc.bias\n size_180 = transformer_h_11_ln_2.size(-1)\n view_143 = transformer_h_11_ln_2.view(-1, size_180); transformer_h_11_ln_2 = size_180 = None\n transformer_h_11_mlp_c_fc_weight = getattr(self.transformer.h, "11").mlp.c_fc.weight\n addmm_46 = torch.addmm(transformer_h_11_mlp_c_fc_bias, view_143, transformer_h_11_mlp_c_fc_weight); transformer_h_11_mlp_c_fc_bias = view_143 = transformer_h_11_mlp_c_fc_weight = None\n view_144 = addmm_46.view(add_154); addmm_46 = add_154 = None\n mul_45 = 0.5 * view_144\n pow_24 = torch.pow(view_144, 3.0)\n mul_46 = 0.044715 * pow_24; pow_24 = None\n add_155 = view_144 + mul_46; view_144 = mul_46 = None\n mul_47 
= 0.7978845608028654 * add_155; add_155 = None\n tanh_11 = torch.tanh(mul_47); mul_47 = None\n add_156 = 1.0 + tanh_11; tanh_11 = None\n mul_48 = mul_45 * add_156; mul_45 = add_156 = None\n size_181 = mul_48.size()\n getitem_148 = size_181[slice(None, -1, None)]; size_181 = None\n add_157 = getitem_148 + (768,); getitem_148 = None\n transformer_h_11_mlp_c_proj_bias = getattr(self.transformer.h, "11").mlp.c_proj.bias\n size_182 = mul_48.size(-1)\n view_145 = mul_48.view(-1, size_182); mul_48 = size_182 = None\n transformer_h_11_mlp_c_proj_weight = getattr(self.transformer.h, "11").mlp.c_proj.weight\n addmm_47 = torch.addmm(transformer_h_11_mlp_c_proj_bias, view_145, transformer_h_11_mlp_c_proj_weight); transformer_h_11_mlp_c_proj_bias = view_145 = transformer_h_11_mlp_c_proj_weight = None\n view_146 = addmm_47.view(add_157); addmm_47 = add_157 = None\n transformer_h_11_mlp_dropout = getattr(self.transformer.h, "11").mlp.dropout(view_146); view_146 = None\n add_158 = add_153 + transformer_h_11_mlp_dropout; add_153 = transformer_h_11_mlp_dropout = None\n transformer_ln_f = self.transformer.ln_f(add_158); add_158 = None\n return (labels, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, permute_41, permute_42, permute_45, permute_46, transformer_ln_f)\n ', "class GraphModule(torch.nn.Module):\n def forward(self, labels, add_2, permute_1, permute_2, permute_5, permute_6, permute_9, permute_10, permute_13, permute_14, permute_17, permute_18, permute_21, permute_22, permute_25, permute_26, permute_29, permute_30, permute_33, permute_34, permute_37, permute_38, permute_41, permute_42, permute_45, permute_46, transformer_ln_f):\n # No stacktrace found for following nodes\n view_147 = transformer_ln_f.view(add_2); transformer_ln_f = add_2 = None\n lm_head = self.lm_head(view_147); 
view_147 = None\n getitem_149 = lm_head[(Ellipsis, slice(None, -1, None), slice(None, None, None))]\n contiguous_12 = getitem_149.contiguous(); getitem_149 = None\n getitem_150 = labels[(Ellipsis, slice(1, None, None))]; labels = None\n contiguous_13 = getitem_150.contiguous(); getitem_150 = None\n size_183 = contiguous_12.size(-1)\n view_148 = contiguous_12.view(-1, size_183); contiguous_12 = size_183 = None\n view_149 = contiguous_13.view(-1); contiguous_13 = None\n crossentropyloss_0 = self.crossentropyloss_0(view_148, view_149); view_148 = view_149 = None\n return {'loss': crossentropyloss_0, 'logits': lm_head, 'past_key_values': ((permute_1, permute_2), (permute_5, permute_6), (permute_9, permute_10), (permute_13, permute_14), (permute_17, permute_18), (permute_21, permute_22), (permute_25, permute_26), (permute_29, permute_30), (permute_33, permute_34), (permute_37, permute_38), (permute_41, permute_42), (permute_45, permute_46))}\n "]