File size: 5,588 Bytes
defedca
 
140a80a
defedca
 
 
 
 
 
 
 
140a80a
defedca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140a80a
defedca
 
 
 
 
140a80a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
defedca
140a80a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
defedca
140a80a
 
 
 
defedca
140a80a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
TensorRT Inference Example for WayraPPL
Requires A100 GPU with TensorRT 10.13+
"""

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from transformers import AutoTokenizer
import torch
import time

class WayraPPLTensorRT:
    """Thin wrapper that runs a serialized TensorRT engine through PyCUDA.

    The engine is addressed through two input tensors named ``input_ids``
    and ``attention_mask`` (both uploaded as int64); every tensor the engine
    reports as an output is copied back to the host and returned by name.
    """

    def __init__(self, engine_path: str):
        """Deserialize the engine stored at *engine_path* and prepare to run it.

        Args:
            engine_path: Path to a serialized TensorRT engine file.

        Raises:
            OSError: If the engine file cannot be opened or read.
            RuntimeError: If TensorRT cannot deserialize the engine or
                create an execution context for it.
        """
        # Load TensorRT engine
        trt_logger = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(trt_logger)

        with open(engine_path, 'rb') as f:
            engine_data = f.read()

        # Keep the runtime referenced for as long as the engine lives
        # (harmless if the bindings already guarantee this).
        self._runtime = runtime
        self.engine = runtime.deserialize_cuda_engine(engine_data)
        if self.engine is None:
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine from {engine_path!r}"
            )

        self.context = self.engine.create_execution_context()
        if self.context is None:
            raise RuntimeError("Failed to create TensorRT execution context")

        self.stream = cuda.Stream()

    @staticmethod
    def _as_int64_buffer(array: np.ndarray) -> np.ndarray:
        """Return *array* as a C-contiguous int64 array ready for upload.

        The device allocation must be sized from this converted array, not
        from the caller's array, because the dtype (and therefore the byte
        count) may differ.
        """
        return np.ascontiguousarray(array, dtype=np.int64)

    def infer(self, input_ids: np.ndarray, attention_mask: np.ndarray):
        """Run one batch through the engine.

        Args:
            input_ids: 2-D array of token ids, ``(batch, seq_len)``.
            attention_mask: Array with the same shape as *input_ids*.

        Returns:
            Dict mapping each output tensor name to a host ``numpy`` array
            holding that output.

        Raises:
            ValueError: If *input_ids* is not 2-D or the two inputs differ
                in shape.
            RuntimeError: If TensorRT reports that enqueueing the inference
                failed.
        """
        if input_ids.shape != attention_mask.shape:
            raise ValueError(
                "input_ids and attention_mask must have the same shape, got "
                f"{input_ids.shape} and {attention_mask.shape}"
            )
        batch_size, _ = input_ids.shape

        # Convert once, up front: these arrays are what actually gets
        # uploaded, so they determine the device buffer sizes and must stay
        # alive until the stream has been synchronized.
        host_ids = self._as_int64_buffer(input_ids)
        host_mask = self._as_int64_buffer(attention_mask)

        # Set dynamic shapes
        self.context.set_input_shape("input_ids", host_ids.shape)
        self.context.set_input_shape("attention_mask", host_mask.shape)

        allocations = []  # every device buffer, so `finally` can free them
        outputs = {}
        device_outputs = {}

        try:
            # Allocate memory
            d_input_ids = cuda.mem_alloc(host_ids.nbytes)
            allocations.append(d_input_ids)
            d_attention_mask = cuda.mem_alloc(host_mask.nbytes)
            allocations.append(d_attention_mask)

            # Copy inputs
            cuda.memcpy_htod_async(d_input_ids, host_ids, self.stream)
            cuda.memcpy_htod_async(d_attention_mask, host_mask, self.stream)

            # Setup outputs
            for i in range(self.engine.num_io_tensors):
                tensor_name = self.engine.get_tensor_name(i)
                if self.engine.get_tensor_mode(tensor_name) != trt.TensorIOMode.OUTPUT:
                    continue

                output_shape = tuple(self.context.get_tensor_shape(tensor_name))
                if output_shape and output_shape[0] == -1:
                    # Leading dim still unresolved: fall back to the batch size.
                    output_shape = (batch_size,) + output_shape[1:]

                # Size the host buffer with the dtype the engine declares
                # rather than assuming float32.
                output_dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
                host_output = np.empty(output_shape, dtype=output_dtype)
                device_output = cuda.mem_alloc(host_output.nbytes)
                allocations.append(device_output)

                outputs[tensor_name] = host_output
                device_outputs[tensor_name] = device_output
                self.context.set_tensor_address(tensor_name, int(device_output))

            # Set input addresses
            self.context.set_tensor_address("input_ids", int(d_input_ids))
            self.context.set_tensor_address("attention_mask", int(d_attention_mask))

            # Execute
            if not self.context.execute_async_v3(stream_handle=self.stream.handle):
                raise RuntimeError("TensorRT execute_async_v3 reported failure")

            # Copy outputs
            for tensor_name, host_output in outputs.items():
                cuda.memcpy_dtoh_async(host_output, device_outputs[tensor_name], self.stream)

            self.stream.synchronize()
        finally:
            # Cleanup — also runs when allocation, enqueue or copy raised.
            for allocation in allocations:
                allocation.free()

        return outputs

# Usage examples with multilingual text
if __name__ == "__main__":
    # Load model
    model = WayraPPLTensorRT("wayrappl_fp16_bs2048.engine")
    tokenizer = AutoTokenizer.from_pretrained(".")

    # Multilingual examples
    texts = [
        # Spanish
        "La inteligencia artificial está transformando el mundo de manera profunda y acelerada.",
        "El análisis de datos permite descubrir patrones ocultos en grandes volúmenes de información.",

        # Portuguese
        "A tecnologia blockchain promete revolucionar sistemas financeiros tradicionais.",
        "Machine learning possibilita a automação de processos complexos em diversas indústrias.",

        # English
        "Natural language processing enables computers to understand human communication.",
        "Deep learning algorithms require massive computational resources for training.",
    ]

    # Single inference
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.infer(inputs['input_ids'].numpy(), inputs['attention_mask'].numpy())

    print("Perplexity scores:")
    for i, text in enumerate(texts):
        print(f"{text[:50]}... -> PPL: {outputs['ppl'][i]:.2f}")

    # Performance comparison: 100K examples
    print("\n" + "="*50)
    print("PERFORMANCE COMPARISON: 100K Examples")
    print("="*50)

    # Generate 100K examples
    large_texts = texts * 16667  # ~100K examples

    # TensorRT benchmark (tokenization time is included in the measurement)
    batch_size = 2048
    total_processed = 0
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()

    for i in range(0, len(large_texts), batch_size):
        batch = large_texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        model.infer(inputs['input_ids'].numpy(), inputs['attention_mask'].numpy())
        total_processed += len(batch)

    tensorrt_time = time.perf_counter() - start_time  # seconds
    tensorrt_throughput = total_processed / tensorrt_time

    print("TensorRT Results:")
    print(f"  Time: {tensorrt_time:.2f} seconds")
    print(f"  Throughput: {tensorrt_throughput:.0f} samples/sec")
    print(f"  Total processed: {total_processed:,} examples")

    # Estimated PyTorch comparison
    pytorch_throughput = 1000  # samples/sec (estimated)
    # Seconds, so it is directly comparable with tensorrt_time.
    pytorch_time = total_processed / pytorch_throughput

    print("\nPyTorch Estimated:")
    print(f"  Time: {pytorch_time:.2f} seconds")
    print(f"  Throughput: {pytorch_throughput} samples/sec")

    # Both times are in seconds, so the ratio is unit-free.
    speedup = pytorch_time / tensorrt_time
    print(f"\nSpeedup: {speedup:.1f}x faster with TensorRT")