From 5bbe208ba0e8693bf65ee4051b3a1f907db1ca1a Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 4 Nov 2024 19:34:08 +0800
Subject: [PATCH 1/2] Remove one of the duplicate bos tokens

---
 lmdeploy/tokenizer.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index e977005588..b65c6b6904 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -624,7 +624,14 @@ def encode(self,
         Returns:
             list[int]: token ids
         """
-        return self.model.encode(s, add_bos, add_special_tokens, **kwargs)
+        encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs)
+        if encoded[:2] == [self.bos_token_id] * 2:
+            get_logger('lmdeploy').warn(
+                f'Detected duplicate leading {self.bos_token_id} in prompt, '
+                'this will likely reduce response quality, one of them will be '
+                'removed')
+            encoded = encoded[1:]
+        return encoded
 
     def decode(
         self,

From 135cb2f9c4ff59b7feca45870ebff97be6abad6a Mon Sep 17 00:00:00 2001
From: AllentDan <41138331+AllentDan@users.noreply.github.com>
Date: Wed, 6 Nov 2024 11:49:36 +0800
Subject: [PATCH 2/2] Update tokenizer.py

---
 lmdeploy/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index b65c6b6904..fb4364602a 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -627,7 +627,7 @@ def encode(self,
         encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs)
         if encoded[:2] == [self.bos_token_id] * 2:
             get_logger('lmdeploy').warn(
-                f'Detected duplicate leading {self.bos_token_id} in prompt, '
+                f'Detected duplicate bos token {self.bos_token_id} in prompt, '
                 'this will likely reduce response quality, one of them will be '
                 'removed')
             encoded = encoded[1:]