diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 023b5478..87e2d42a 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -209,7 +209,7 @@ def __init__( embed_dim, 3 * embed_dim, process_group, - bias=False, + bias=True, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) # according to https://spaces.ac.cn/archives/9577 @@ -232,7 +232,7 @@ def __init__( embed_dim, embed_dim, process_group, - bias=False, + bias=True, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, )