Roman Solomatin
committed on
update after review
Browse files

listconranker.py (+101 -4)
CHANGED
@@ -30,6 +30,9 @@ from transformers import (
 import os
 from transformers.modeling_outputs import SequenceClassifierOutput
 from typing import Union, List, Optional
+from collections import defaultdict
+import numpy as np
+import math
 
 
 class ListConRankerConfig(BertConfig):
@@ -295,14 +298,15 @@ class ListConRankerModel(PreTrainedModel):
             if sep_idxs.numel() == 0:
                 raise ValueError(f"No SEP in sequence {idx}")
             first_sep = sep_idxs[0].item()
+            second_sep = sep_idxs[1].item()
 
             # Extract query and passage
             q_seq = seq[: first_sep + 1]
             q_mask = mask[: first_sep + 1]
             q_tt = torch.zeros_like(q_seq)
 
-            p_seq = seq[first_sep:]
-            p_mask = mask[first_sep:]
+            p_seq = seq[first_sep : second_sep + 1]
+            p_mask = mask[first_sep : second_sep + 1]
             p_seq = p_seq.clone()
             p_seq[0] = self.config.cls_token_id
             p_tt = torch.zeros_like(p_seq)
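The change above bounds the passage span by the second [SEP] instead of taking everything after the first one, so padding appended by batch collation no longer leaks into the passage tokens. A minimal sketch of the split, assuming the standard BERT special-token ids (101/102 are illustrative, not taken from the commit):

import torch

CLS, SEP = 101, 102  # assumed BERT special-token ids
seq = torch.tensor([CLS, 7, 8, SEP, 9, 10, SEP, 0, 0])  # [CLS] q [SEP] p [SEP] <pad>

sep_idxs = (seq == SEP).nonzero(as_tuple=True)[0]
first_sep, second_sep = sep_idxs[0].item(), sep_idxs[1].item()

q_seq = seq[: first_sep + 1]                     # [CLS] q [SEP]
p_seq = seq[first_sep : second_sep + 1].clone()  # [SEP] p [SEP], padding excluded
p_seq[0] = CLS                                   # reuse the leading slot as [CLS]

print(q_seq.tolist())  # [101, 7, 8, 102]
print(p_seq.tolist())  # [101, 9, 10, 102]

With the old seq[first_sep:] slice, p_seq would have ended [..., 102, 0, 0], carrying the pad tokens into the passage encoding.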
@@ -315,6 +319,16 @@ class ListConRankerModel(PreTrainedModel):
                 ].tolist()
             )
 
+            # truncation
+            q_seq = q_seq[: self.config.max_position_embeddings]
+            q_seq[-1] = self.config.sep_token_id
+            p_seq = p_seq[: self.config.max_position_embeddings]
+            p_seq[-1] = self.config.sep_token_id
+            q_mask = q_mask[: self.config.max_position_embeddings]
+            p_mask = p_mask[: self.config.max_position_embeddings]
+            q_tt = q_tt[: self.config.max_position_embeddings]
+            p_tt = p_tt[: self.config.max_position_embeddings]
+
             if key not in grouped:
                 grouped[key] = {
                     "query": (q_seq, q_mask, q_tt),
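The new truncation block clips every split tensor to the encoder's position limit and then rewrites the last token as [SEP], so a clipped sequence still ends with a separator. A minimal sketch of the clip-then-reseal pattern (toy limit and ids, assumed rather than read from the config):

import torch

MAX_LEN, SEP = 4, 102  # toy position limit and assumed SEP id
q_seq = torch.tensor([101, 7, 8, 9, 10, 102])  # longer than MAX_LEN

q_seq = q_seq[:MAX_LEN]  # clip to the position limit
q_seq[-1] = SEP          # clipped sequences must still end in [SEP]
print(q_seq.tolist())    # [101, 7, 8, 102]

When the sequence is already within the limit, the last token is the closing [SEP] itself, so the overwrite is a no-op.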
@@ -396,7 +410,7 @@ class ListConRankerModel(PreTrainedModel):
     ):
         model = super().from_pretrained(model_name_or_path, config=config, **kwargs)
         model.hf_model = BertModel.from_pretrained(
-            model_name_or_path, config=model.config.bert_config
+            model_name_or_path, config=model.config.bert_config, **kwargs
         )
 
         linear_path = os.path.join(model_name_or_path, "linear_in_embedding.pt")
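Forwarding **kwargs here means loading options passed to the outer from_pretrained now reach the inner BertModel as well. A hypothetical usage sketch (torch_dtype is a standard from_pretrained kwarg; the import path is an assumption):

import torch
from listconranker import ListConRankerModel  # assumed local import path

# Both the wrapper and the inner BertModel now load in half precision.
model = ListConRankerModel.from_pretrained(
    "ByteDance/ListConRanker",
    torch_dtype=torch.float16,
)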
@@ -439,11 +453,94 @@ class ListConRankerModel(PreTrainedModel):
             inputs = tokenizer(
                 batch_pairs,
                 padding=True,
-                truncation=True,
+                truncation=False,
                 return_tensors="pt",
             )
+
+            for k, v in inputs.items():
+                inputs[k] = v.to(self.device)
+
             logits = self(**inputs)[0]
             total_logits[batch * batch_size : (batch + 1) * batch_size] = (
                 logits.squeeze(1)
             )
         return total_logits
+
+    def multi_passage_in_iterative_inference(
+        self,
+        sentences: List[str],
+        stop_num: int = 20,
+        decrement_rate: float = 0.2,
+        min_filter_num: int = 10,
+        tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(
+            "ByteDance/ListConRanker"
+        ),
+    ):
+        """
+        Process multiple passages for one query with iterative inference.
+        :param sentences: List containing the query followed by its passages.
+        :return: List of final scores, one per passage.
+        """
+        if stop_num < 1:
+            raise ValueError("stop_num must be greater than 0")
+        if decrement_rate <= 0 or decrement_rate >= 1:
+            raise ValueError("decrement_rate must be in (0, 1)")
+        if min_filter_num < 1:
+            raise ValueError("min_filter_num must be greater than 0")
+
+        query = sentences[0]
+        passage = sentences[1:]
+
+        filter_times = 0
+        passage2score = defaultdict(list)
+        while len(passage) > stop_num:
+            batch = [[query] + passage]
+            pred_scores = self.multi_passage(
+                batch, batch_size=len(batch[0]) - 1, tokenizer=tokenizer
+            ).tolist()
+            pred_scores_argsort = np.argsort(
+                pred_scores
+            ).tolist()  # Sort in increasing order
+
+            passage_len = len(passage)
+            to_filter_num = math.ceil(passage_len * decrement_rate)
+            if to_filter_num < min_filter_num:
+                to_filter_num = min_filter_num
+
+            have_filter_num = 0
+            while have_filter_num < to_filter_num:
+                idx = pred_scores_argsort[have_filter_num]
+                passage2score[passage[idx]].append(pred_scores[idx] + filter_times)
+                have_filter_num += 1
+            while (
+                pred_scores[pred_scores_argsort[have_filter_num - 1]]
+                == pred_scores[pred_scores_argsort[have_filter_num]]
+            ):
+                idx = pred_scores_argsort[have_filter_num]
+                passage2score[passage[idx]].append(pred_scores[idx] + filter_times)
+                have_filter_num += 1
+            next_passage = []
+            next_passage_idx = have_filter_num
+            while next_passage_idx < len(passage):
+                idx = pred_scores_argsort[next_passage_idx]
+                next_passage.append(passage[idx])
+                next_passage_idx += 1
+            passage = next_passage
+            filter_times += 1
+
+        batch = [[query] + passage]
+        pred_scores = self.multi_passage(
+            batch, batch_size=len(batch[0]) - 1, tokenizer=tokenizer
+        ).tolist()
+
+        cnt = 0
+        while cnt < len(passage):
+            passage2score[passage[cnt]].append(pred_scores[cnt] + filter_times)
+            cnt += 1
+
+        passage = sentences[1:]
+        final_score = []
+        for i in range(len(passage)):
+            p = passage[i]
+            final_score.append(passage2score[p][0])
+        return final_score
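The new multi_passage_in_iterative_inference helper filters passages over several rounds: each round scores all surviving passages with multi_passage, drops at least min_filter_num of them (or a decrement_rate fraction of the pool, whichever is larger, plus any passages tied at the cut), and records each dropped passage's score plus the number of completed rounds, so passages eliminated later carry larger offsets; the loop stops once stop_num or fewer passages remain, and the survivors are scored in a final pass. A hypothetical usage sketch (query and passages are invented; model is assumed to be a loaded ListConRankerModel; the checkpoint name is the one used in the commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ByteDance/ListConRanker")
query = "What does a listwise reranker do?"
passages = [f"candidate passage {i}" for i in range(40)]  # toy passages

scores = model.multi_passage_in_iterative_inference(
    [query] + passages,
    tokenizer=tokenizer,
)
ranked = sorted(zip(passages, scores), key=lambda s: s[1], reverse=True)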