Upload Spanish PII detection model OpenMed-PII-Spanish-SuperClinical-Large-434M-v1
Browse files
README.md
CHANGED
|
@@ -219,6 +219,21 @@ for entity in entities:
|
|
| 219 |
print(f"{entity['entity_group']}: {entity['word']} (score: {entity['score']:.3f})")
|
| 220 |
```
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
### De-identification Example
|
| 223 |
|
| 224 |
```python
|
|
|
|
| 219 |
print(f"{entity['entity_group']}: {entity['word']} (score: {entity['score']:.3f})")
|
| 220 |
```
|
| 221 |
|
| 222 |
+
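> **Important — Accent Handling:** This model was trained on text without diacritical marks (accents). For best results, strip accents from your input before inference. As long as the input is NFC-normalized and each accented letter is a single precomposed character (the usual case for Spanish text), character offsets are preserved, so you can map entities back to the original text.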
|
| 223 |
+
>
|
| 224 |
+
> ```python
|
| 225 |
+
> import unicodedata
|
| 226 |
+
>
|
| 227 |
+
> def strip_accents(text: str) -> str:
|
| 228 |
+
>     nfc = unicodedata.normalize("NFC", text)
|
| 229 |
+
>     nfd = unicodedata.normalize("NFD", nfc)
|
| 230 |
+
>     stripped = "".join(ch for ch in nfd if unicodedata.category(ch) != "Mn")
|
| 231 |
+
>     return unicodedata.normalize("NFC", stripped)
|
| 232 |
+
>
|
| 233 |
+
> text = strip_accents(text) # call before passing to the pipeline
|
| 234 |
+
> entities = ner(text)
|
| 235 |
+
> ```
|
| 236 |
+
|
| 237 |
### De-identification Example
|
| 238 |
|
| 239 |
```python
|