
Commit

test
remove other

add max test

fix bug

add no prompt cache

add time cost test

add autotune

add all
sufubao committed Sep 25, 2024
1 parent 377a882 commit 3854d32
Showing 3 changed files with 634 additions and 171 deletions.
@@ -143,6 +143,7 @@ def _context_attention_kernel(

    o_tensor = self.alloc_tensor(q.shape, q.dtype) if out is None else out
    if infer_state.use_dynamic_prompt_cache:
        kv = infer_state.mem_manager.kv_buffer[self.layer_num_]

    context_attention_fwd(
        q.view(-1, self.tp_q_head_num_, self.head_dim_),
        kv[:, 0 : self.tp_k_head_num_, :],
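For context on the slice in the hunk above: `kv[:, 0 : self.tp_k_head_num_, :]` suggests a fused per-layer KV buffer in which the key heads and value heads sit side by side along the head axis, so K and V are taken as zero-copy slices before the attention kernel is called. A minimal NumPy sketch of that layout (all names, shapes, and the fused-buffer assumption are hypothetical, not taken from the repository; the real code operates on torch tensors):

```python
import numpy as np

# Hypothetical sizes; in the real code these come from the model config
# (e.g. tp_k_head_num_, head_dim_).
num_tokens, k_heads, v_heads, head_dim = 8, 2, 2, 16

# Assumed fused per-layer KV buffer: K heads and V heads stacked along
# the head axis, shape [num_tokens, k_heads + v_heads, head_dim].
kv = np.random.rand(num_tokens, k_heads + v_heads, head_dim).astype(np.float32)

# Mirrors the diff's slicing: the first k_heads entries are keys,
# the remaining entries are values. Both are views, so no data is copied.
k = kv[:, 0:k_heads, :]
v = kv[:, k_heads : k_heads + v_heads, :]

assert k.shape == (num_tokens, k_heads, head_dim)
assert v.shape == (num_tokens, v_heads, head_dim)
assert np.shares_memory(k, kv) and np.shares_memory(v, kv)
```

Slicing views rather than copying matters here because the KV buffer for a long prompt can be large, and the attention kernel only needs read access to the key and value regions.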
