diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index e977005588..fb4364602a 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -624,7 +624,14 @@ def encode(self,
         Returns:
             list[int]: token ids
         """
-        return self.model.encode(s, add_bos, add_special_tokens, **kwargs)
+        encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs)
+        if encoded[:2] == [self.bos_token_id] * 2:
+            get_logger('lmdeploy').warning(
+                f'Detected duplicate bos token {self.bos_token_id} in prompt, '
+                'which will likely reduce response quality. One of them will '
+                'be removed')
+            encoded = encoded[1:]
+        return encoded
 
     def decode(
         self,
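
For context, the added guard can be exercised outside lmdeploy. The sketch below is a minimal, self-contained illustration under assumed values: the `strip_duplicate_bos` helper, the bos token id 1, and the sample token ids are hypothetical and only mirror the logic of the patch, which drops one of two leading BOS ids. A duplicate BOS typically appears when a chat template already spells out the BOS text and the underlying tokenizer prepends another BOS during encoding.

# Minimal sketch of the duplicate-BOS guard, independent of lmdeploy.
# The bos_token_id value (1) and the sample token ids are illustrative
# assumptions, not taken from the patch.
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('dedup-bos-demo')


def strip_duplicate_bos(encoded: list[int], bos_token_id: int) -> list[int]:
    """Drop one leading BOS token when the prompt starts with two of them."""
    if encoded[:2] == [bos_token_id] * 2:
        logger.warning(
            'Detected duplicate bos token %d in prompt, which will likely '
            'reduce response quality. One of them will be removed',
            bos_token_id)
        encoded = encoded[1:]
    return encoded


# Two leading BOS ids: one is stripped and a warning is logged.
print(strip_duplicate_bos([1, 1, 29871, 13], bos_token_id=1))  # -> [1, 29871, 13]
# A single BOS is left untouched.
print(strip_duplicate_bos([1, 29871, 13], bos_token_id=1))     # -> [1, 29871, 13]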