import os
import sys
import argparse
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional
import warnings

import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoModel, AutoTokenizer

warnings.filterwarnings('ignore')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('embedding_generation.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class AffiliationEmbedder:
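    """Generate embeddings for affiliation strings with a locally stored transformer model."""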
    def __init__(
        self,
        model_path: str = "./affiliation-clustering-0.3b",
        device: Optional[str] = None,
        batch_size: int = 32,
        max_length: int = 512,
        use_fp16: bool = False
    ):
        self.model_path = model_path
        self.batch_size = batch_size
        self.max_length = max_length
        self.use_fp16 = use_fp16

        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device)

        logger.info(f"Using device: {self.device}")
        if self.device.type == 'cuda':
            logger.info(f"GPU: {torch.cuda.get_device_name()}")
            logger.info(f"Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

        self._load_model()
    def _load_model(self):
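        """Load the tokenizer and model, move the model to the target device, and switch to eval mode."""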
        logger.info(f"Loading model from {self.model_path}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_path,
                trust_remote_code=True
            )
            self.model = AutoModel.from_pretrained(
                self.model_path,
                trust_remote_code=True
            )
            self.model = self.model.to(self.device)
            if self.use_fp16 and self.device.type == 'cuda':
                self.model = self.model.half()
                logger.info("Using FP16 mixed precision")
            self.model.eval()
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise
    def encode_batch(self, texts: List[str]) -> np.ndarray:
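        """Tokenize a batch of strings and return L2-normalized embeddings as a float32 NumPy array."""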
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        encoded = {k: v.to(self.device) for k, v in encoded.items()}

        with torch.no_grad():
            outputs = self.model(**encoded)

        if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
            embeddings = outputs.pooler_output
        else:
            # Mean pooling over non-padding tokens.
            token_embeddings = outputs.last_hidden_state
            attention_mask = encoded['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
            masked_embeddings = token_embeddings * attention_mask
            # Clamp guards against division by zero for an all-padding row.
            embeddings = masked_embeddings.sum(dim=1) / attention_mask.sum(dim=1).clamp(min=1e-9)

        embeddings = F.normalize(embeddings, p=2, dim=1)
        embeddings = embeddings.cpu().numpy()
        if self.use_fp16:
            embeddings = embeddings.astype(np.float32)
        return embeddings
    def process_dataset(
        self,
        data_path: str,
        output_path: str,
        checkpoint_interval: int = 1000
    ) -> None:
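        """Embed every affiliation in a parquet file, checkpointing every `checkpoint_interval` rows."""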
        logger.info(f"Processing dataset: {data_path}")
        df = pd.read_parquet(data_path)
        logger.info(f"Loaded {len(df)} samples")

        checkpoint_path = output_path.replace('.parquet', '_checkpoint.parquet')
        start_idx = 0
        processed_rows = []
        if os.path.exists(checkpoint_path):
            logger.info(f"Found checkpoint at {checkpoint_path}")
            checkpoint_df = pd.read_parquet(checkpoint_path)
            # Keep the rows that were already embedded so they end up in the final output.
            processed_rows = checkpoint_df.to_dict('records')
            start_idx = len(checkpoint_df)
            logger.info(f"Resuming from index {start_idx}")

        total_batches = (len(df) - start_idx + self.batch_size - 1) // self.batch_size
        with tqdm(total=total_batches, desc="Generating embeddings") as pbar:
            for i in range(start_idx, len(df), self.batch_size):
                batch_df = df.iloc[i:i + self.batch_size]
                texts = batch_df['affiliation_name'].tolist()
                try:
                    batch_embeddings = self.encode_batch(texts)
                    for j, embedding in enumerate(batch_embeddings):
                        row_idx = i + j
                        row_data = df.iloc[row_idx].to_dict()
                        row_data['embedding'] = embedding
                        processed_rows.append(row_data)
                        if len(processed_rows) % checkpoint_interval == 0:
                            self._save_checkpoint(processed_rows, checkpoint_path)
                            logger.info(f"Checkpoint saved at {len(processed_rows)} samples")
                    pbar.update(1)
                except Exception as e:
                    logger.error(f"Error processing batch at index {i}: {e}")
                    if processed_rows:
                        self._save_checkpoint(processed_rows, checkpoint_path)
                    raise
        result_df = pd.DataFrame(processed_rows)
        logger.info(f"Saving embeddings to {output_path}")
        result_df.to_parquet(output_path, compression='snappy')

        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)
            logger.info("Checkpoint file removed")

        logger.info(f"Successfully generated embeddings for {len(result_df)} samples")
        embedding_dim = len(result_df['embedding'].iloc[0])
        logger.info(f"Embedding dimension: {embedding_dim}")
        logger.info(f"Output file size: {os.path.getsize(output_path) / 1e6:.2f} MB")
    def _save_checkpoint(self, processed_rows: List[Dict], checkpoint_path: str):
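        """Persist the rows processed so far so an interrupted run can resume."""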
        checkpoint_df = pd.DataFrame(processed_rows)
        checkpoint_df.to_parquet(checkpoint_path, compression='snappy')


def main():
    parser = argparse.ArgumentParser(
        description="Generate embeddings for affiliation strings"
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default="./affiliation-clustering-0.3b",
        help="Path to the pre-trained model directory"
    )
    parser.add_argument(
        "--data-dir",
        type=str,
        default="./20250727-unique-openalex-affiliations-w-ror-ids-top-1K-ror-ids-100-per-sample",
        help="Directory containing the input parquet files"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./20250727-unique-openalex-affiliations-w-ror-ids-top-1K-ror-ids-100-per-sample-embeddings",
        help="Directory to save the output embeddings"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=32,
        help="Batch size for processing"
    )
    parser.add_argument(
        "--max-length",
        type=int,
        default=512,
        help="Maximum sequence length for tokenization"
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (cuda/cpu, auto-detect if not specified)"
    )
    parser.add_argument(
        "--use-fp16",
        action="store_true",
        help="Use FP16 mixed precision for faster processing"
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=1000,
        help="Save a checkpoint every N processed samples"
    )
    parser.add_argument(
        "--push-to-hub",
        action="store_true",
        help="Push the resulting dataset to Hugging Face Hub"
    )
    parser.add_argument(
        "--hub-dataset-id",
        type=str,
        default=None,
        help="Hugging Face Hub dataset ID (required if push-to-hub is set)"
    )
    args = parser.parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    embedder = AffiliationEmbedder(
        model_path=args.model_path,
        device=args.device,
        batch_size=args.batch_size,
        max_length=args.max_length,
        use_fp16=args.use_fp16
    )

    data_dir = Path(args.data_dir)
    train_files = sorted(data_dir.glob("*_train.parquet"))
    test_files = sorted(data_dir.glob("*_test.parquet"))
    if not train_files or not test_files:
        logger.error(f"Could not find *_train.parquet and *_test.parquet files in {data_dir}")
        sys.exit(1)
    train_file = train_files[0]
    test_file = test_files[0]

    train_output = output_dir / "train_embeddings.parquet"
    test_output = output_dir / "test_embeddings.parquet"
    logger.info("Processing training dataset...")
    embedder.process_dataset(
        str(train_file),
        str(train_output),
        checkpoint_interval=args.checkpoint_interval
    )

    logger.info("Processing test dataset...")
    embedder.process_dataset(
        str(test_file),
        str(test_output),
        checkpoint_interval=args.checkpoint_interval
    )
    if args.push_to_hub:
        if not args.hub_dataset_id:
            logger.error("--hub-dataset-id is required when --push-to-hub is set")
            sys.exit(1)
        logger.info(f"Pushing dataset to Hugging Face Hub: {args.hub_dataset_id}")
        try:
            from huggingface_hub import login

            token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
            if token:
                login(token=token)
                logger.info("Authenticated with Hugging Face Hub using token")
            else:
                logger.info("No HF token found in environment, attempting to use existing credentials")

            logger.info("Loading generated embeddings...")
            train_df = pd.read_parquet(train_output)
            test_df = pd.read_parquet(test_output)
            logger.info(f"Train dataset: {len(train_df)} samples")
            logger.info(f"Test dataset: {len(test_df)} samples")

            logger.info("Creating dataset dictionary...")
            dataset_dict = DatasetDict({
                'train': Dataset.from_pandas(train_df),
                'test': Dataset.from_pandas(test_df)
            })

            logger.info(f"Pushing to hub: {args.hub_dataset_id}")
            dataset_dict.push_to_hub(
                args.hub_dataset_id,
                private=False,
                commit_message="Add affiliation embeddings generated with affiliation-clustering-0.3b model"
            )
            logger.info(f"Dataset successfully pushed to {args.hub_dataset_id}")
            logger.info(f"View at: https://huggingface.co/datasets/{args.hub_dataset_id}")
        except ImportError as e:
            logger.error(f"Failed to import required libraries: {e}")
            logger.error("Make sure huggingface_hub and datasets are installed")
            sys.exit(1)
        except Exception as e:
            logger.error(f"Failed to push dataset to hub: {e}")
            logger.error(f"Error type: {type(e).__name__}")
            import traceback
            logger.error(f"Traceback: {traceback.format_exc()}")
            sys.exit(1)
    logger.info("Embedding generation completed successfully!")


if __name__ == "__main__":
    main()