能方便把你的测试代码共享一下吗？
# chat: benchmark 10 greedy chat calls and report total elapsed time
max_length = 2048
top_p = 0.01  # smaller -> more deterministic (ignored when do_sample=False)
temperature = 0.01  # smaller -> more deterministic (ignored when do_sample=False)
import time

t1 = time.time()
for _ in range(10):
    # renamed from `input` to avoid shadowing the builtin
    prompt = 'xxxx'
    # do_sample=False forces greedy decoding; top_p/temperature are then
    # ignored by transformers (it only warns) — kept for parity with the
    # generate section below.
    response, history = model.chat(tokenizer, prompt, history=[],
                                   max_length=max_length, top_p=top_p,
                                   temperature=temperature, do_sample=False)
    print(prompt, end=' ')
    print(response)  # fixed: original called undefined `gprint` (NameError)
print('耗时:', time.time() - t1)
# generate: benchmark 10 greedy model.generate calls and report elapsed time
max_length = 2048
top_p = 0.01  # smaller -> more deterministic (ignored when do_sample=False)
temperature = 0.01  # smaller -> more deterministic (ignored when do_sample=False)
t1 = time.time()
# Hoisted out of the loop: these kwargs are loop-invariant, no need to
# rebuild the dict on every iteration. Also reuse the top_p/temperature
# variables defined above instead of re-hard-coding 0.01.
# NOTE(review): with do_sample=False decoding is greedy, so top_p and
# temperature have no effect (transformers emits a warning) — kept to
# mirror the chat section.
gen_kwargs = {"max_length": max_length, "num_beams": 1, "do_sample": False,
              "top_p": top_p, "temperature": temperature,
              "logits_processor": None, "use_cache": True}
for _ in range(10):
    # renamed from `input` to avoid shadowing the builtin
    prompt = 'xxx '
    # NOTE(review): max_length passed to the tokenizer only takes effect
    # together with truncation=True; as written nothing is truncated —
    # confirm this is intentional.
    inputs = tokenizer(prompt, max_length=max_length, return_tensors="pt")
    inputs = inputs.to('cuda')
    outputs = model.generate(**inputs, **gen_kwargs)
    print(tokenizer.decode(outputs[0]))
print('耗时:', time.time() - t1)