tararad committed
Commit f14f7ab · verified · 1 Parent(s): 5cc118f

Update detector.py

Files changed (1):
  detector.py +46 -35
detector.py CHANGED
@@ -1,56 +1,67 @@
-# detector.py
 import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
 class CustomDetector:
     def __init__(self, model_name="tiiuae/falcon-rw-1b", max_length=512, batch_size=80):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.num_gpus = torch.cuda.device_count()
         self.model_name = model_name
         self.max_length = max_length
         self.batch_size = batch_size
 
-        # Load tokenizer and model
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
 
-        if self.num_gpus > 1:
-            self.model = torch.nn.DataParallel(self.model)
         self.model.to(self.device)
         self.model.eval()
 
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
-    def compute_score(self, texts):
         if isinstance(texts, str):
             texts = [texts]
 
-        with torch.no_grad():
-            tokenized = self.tokenizer(
-                texts,
-                truncation=True,
-                padding=True,
-                max_length=self.max_length,
-                return_tensors="pt",
-            )
-            tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
-            input_ids = tokenized["input_ids"]
-            attention_mask = tokenized["attention_mask"]
-
-            outputs = self.model(**tokenized)
-            logits = outputs.logits[:, :-1, :]
-            labels = tokenized["input_ids"][:, 1:]
-
-            log_probs = F.log_softmax(logits, dim=-1)
-            ll_per_token = log_probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)
-            attention_mask = tokenized["attention_mask"][:, 1:]
-            ll_per_sample = (ll_per_token * attention_mask).sum(dim=-1) / attention_mask.sum(dim=1).clamp(min=1)
-
-            neg_entropy = (log_probs.exp() * log_probs)
-            entropy_per_sample = -(neg_entropy.sum(dim=-1) * attention_mask).sum(-1) / attention_mask.sum(dim=1).clamp(min=1)
-
-            scores = (entropy_per_sample + ll_per_sample).cpu().tolist()
-
-            return scores if len(scores) > 1 else scores[0]
 import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import os
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 class CustomDetector:
     def __init__(self, model_name="tiiuae/falcon-rw-1b", max_length=512, batch_size=80):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model_name = model_name
         self.max_length = max_length
         self.batch_size = batch_size
 
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load model {model_name}: {str(e)}")
 
         self.model.to(self.device)
         self.model.eval()
 
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
+    def my_detector(self, texts: list[str]) -> list[float]:
         if isinstance(texts, str):
             texts = [texts]
 
+        try:
+            with torch.no_grad():
+                tokenized = self.tokenizer(
+                    texts,
+                    truncation=True,
+                    padding=True,
+                    max_length=self.max_length,
+                    return_tensors="pt",
+                )
+                tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
+                input_ids = tokenized["input_ids"]
+                attention_mask = tokenized["attention_mask"]
+
+                outputs = self.model(**tokenized)
+                logits = outputs.logits[:, :-1, :]
+                labels = tokenized["input_ids"][:, 1:]
+
+                log_probs = F.log_softmax(logits, dim=-1)
+                ll_per_token = log_probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)
+                attention_mask = tokenized["attention_mask"][:, 1:]
+                ll_per_sample = (ll_per_token * attention_mask).sum(dim=-1) / attention_mask.sum(dim=1).clamp(min=1)
+
+                neg_entropy = (log_probs.exp() * log_probs)
+                entropy_per_sample = -(neg_entropy.sum(dim=-1) * attention_mask).sum(-1) / attention_mask.sum(dim=1).clamp(min=1)
+
+                scores = (entropy_per_sample + ll_per_sample).cpu().tolist()
+
+                return scores
+        except Exception as e:
+            raise RuntimeError(f"Error computing score: {str(e)}")
+
+    def batch_gpu_detector(self, all_texts):
+        results = []
+        for i in range(0, len(all_texts), self.batch_size):
+            batch_texts = all_texts[i:i + self.batch_size]
+            results.extend(self.my_detector(batch_texts))
+        return results
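
For reference, the committed scorer assigns each text its length-normalized token log-likelihood plus the mean predictive entropy under falcon-rw-1b, computed in a single forward pass per batch. A minimal usage sketch follows (not part of the commit; it assumes detector.py is importable and the default checkpoint can be downloaded):

# Minimal usage sketch (hypothetical, not part of the commit).
from detector import CustomDetector

detector = CustomDetector()  # defaults: tiiuae/falcon-rw-1b, max_length=512, batch_size=80

texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Language models assign probabilities to token sequences.",
]

# batch_gpu_detector slices the input into batch_size chunks and
# delegates each chunk to my_detector, concatenating the scores.
scores = detector.batch_gpu_detector(texts)
for text, score in zip(texts, scores):
    print(f"{score:.4f}  {text}")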