Spaces:
Runtime error
Upload 13 files
- lib/.ipynb_checkpoints/mlutil-checkpoint.ipynb +1297 -0
- lib/.ipynb_checkpoints/sampler-checkpoint.ipynb +6 -0
- lib/.ipynb_checkpoints/stats-checkpoint.ipynb +510 -0
- lib/.ipynb_checkpoints/tnn-checkpoint.ipynb +800 -0
- lib/.ipynb_checkpoints/txproc-checkpoint.ipynb +1002 -0
- lib/.ipynb_checkpoints/util-checkpoint.ipynb +2141 -0
- lib/mlutil.ipynb +1297 -0
- lib/sampler.ipynb +1366 -0
- lib/stats.ipynb +510 -0
- lib/tnn.ipynb +800 -0
- lib/txproc.ipynb +1002 -0
- lib/util.ipynb +2141 -0
- model/tnn/pdamb.mod +0 -0
lib/.ipynb_checkpoints/mlutil-checkpoint.ipynb
ADDED
@@ -0,0 +1,1297 @@
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "2d05ce02",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"import numpy as np\n",
|
| 13 |
+
"from sklearn import preprocessing\n",
|
| 14 |
+
"from sklearn import metrics\n",
|
| 15 |
+
"from sklearn.datasets import make_blobs\n",
|
| 16 |
+
"from sklearn.datasets import make_classification\n",
|
| 17 |
+
"import random\n",
|
| 18 |
+
"from math import *\n",
|
| 19 |
+
"from decimal import Decimal\n",
|
| 20 |
+
"import statistics\n",
|
| 21 |
+
"import jprops\n",
|
| 22 |
+
"from Levenshtein import distance as ld\n",
|
| 23 |
+
"from util import *\n",
|
| 24 |
+
"from sampler import *\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"class Configuration:\n",
|
| 27 |
+
" \"\"\"\n",
|
| 28 |
+
" Configuration management. Supports default value, mandatory value and typed value.\n",
|
| 29 |
+
" \"\"\"\n",
|
| 30 |
+
" def __init__(self, configFile, defValues, verbose=False):\n",
|
| 31 |
+
" \"\"\"\n",
|
| 32 |
+
" initializer\n",
|
| 33 |
+
"\n",
|
| 34 |
+
" Parameters\n",
|
| 35 |
+
" configFile : config file path\n",
|
| 36 |
+
" defValues : dictionary of default values\n",
|
| 37 |
+
" verbose : verbosity flag\n",
|
| 38 |
+
" \"\"\"\n",
|
| 39 |
+
" configs = {}\n",
|
| 40 |
+
" with open(configFile) as fp:\n",
|
| 41 |
+
" for key, value in jprops.iter_properties(fp):\n",
|
| 42 |
+
" configs[key] = value\n",
|
| 43 |
+
" self.configs = configs\n",
|
| 44 |
+
" self.defValues = defValues\n",
|
| 45 |
+
" self.verbose = verbose\n",
|
| 46 |
+
"\n",
|
| 47 |
+
" def override(self, configFile):\n",
|
| 48 |
+
" \"\"\"\n",
|
| 49 |
+
" over ride configuration from file\n",
|
| 50 |
+
"\n",
|
| 51 |
+
" Parameters\n",
|
| 52 |
+
" configFile : override config file path\n",
|
| 53 |
+
" \"\"\"\n",
|
| 54 |
+
" with open(configFile) as fp:\n",
|
| 55 |
+
" for key, value in jprops.iter_properties(fp):\n",
|
| 56 |
+
" self.configs[key] = value\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"\n",
|
| 59 |
+
" def setParam(self, name, value):\n",
|
| 60 |
+
" \"\"\"\n",
|
| 61 |
+
" override individual configuration\n",
|
| 62 |
+
" Parameters\n",
|
| 63 |
+
" name : config param name\n",
|
| 64 |
+
" value : config param value\n",
|
| 65 |
+
" \"\"\"\n",
|
| 66 |
+
" self.configs[name] = value\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"\n",
|
| 69 |
+
" def getStringConfig(self, name):\n",
|
| 70 |
+
" \"\"\"\n",
|
| 71 |
+
" get string param\n",
|
| 72 |
+
" Parameters\n",
|
| 73 |
+
" name : config param name\n",
|
| 74 |
+
" \"\"\"\n",
|
| 75 |
+
" if self.isNone(name):\n",
|
| 76 |
+
" val = (None, False)\n",
|
| 77 |
+
" elif self.isDefault(name):\n",
|
| 78 |
+
" val = (self.handleDefault(name), True)\n",
|
| 79 |
+
" else:\n",
|
| 80 |
+
" val = (self.configs[name], False)\n",
|
| 81 |
+
" if self.verbose:\n",
|
| 82 |
+
" print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
|
| 83 |
+
" return val\n",
|
| 84 |
+
"\n",
|
| 85 |
+
"\n",
|
| 86 |
+
" def getIntConfig(self, name):\n",
|
| 87 |
+
" \"\"\"\n",
|
| 88 |
+
" get int param\n",
|
| 89 |
+
" Parameters\n",
|
| 90 |
+
" name : config param name\n",
|
| 91 |
+
" \"\"\"\n",
|
| 92 |
+
" #print \"%s %s\" %(name,self.configs[name])\n",
|
| 93 |
+
" if self.isNone(name):\n",
|
| 94 |
+
" val = (None, False)\n",
|
| 95 |
+
" elif self.isDefault(name):\n",
|
| 96 |
+
" val = (self.handleDefault(name), True)\n",
|
| 97 |
+
" else:\n",
|
| 98 |
+
" val = (int(self.configs[name]), False)\n",
|
| 99 |
+
" if self.verbose:\n",
|
| 100 |
+
" print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
|
| 101 |
+
" return val\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"\n",
|
| 104 |
+
" def getFloatConfig(self, name):\n",
|
| 105 |
+
" \"\"\"\n",
|
| 106 |
+
" get float param\n",
|
| 107 |
+
" Parameters\n",
|
| 108 |
+
" name : config param name\n",
|
| 109 |
+
" \"\"\"\n",
|
| 110 |
+
" #print \"%s %s\" %(name,self.configs[name])\n",
|
| 111 |
+
" if self.isNone(name):\n",
|
| 112 |
+
" val = (None, False)\n",
|
| 113 |
+
" elif self.isDefault(name):\n",
|
| 114 |
+
" val = (self.handleDefault(name), True)\n",
|
| 115 |
+
" else:\n",
|
| 116 |
+
" val = (float(self.configs[name]), False)\n",
|
| 117 |
+
" if self.verbose:\n",
|
| 118 |
+
" print( \"{} {} {:06.3f}\".format(name, self.configs[name], val[0]))\n",
|
| 119 |
+
" return val\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"\n",
|
| 122 |
+
" def getBooleanConfig(self, name):\n",
|
| 123 |
+
" \"\"\"\n",
|
| 124 |
+
" #get boolean param\n",
|
| 125 |
+
" Parameters\n",
|
| 126 |
+
" name : config param name\n",
|
| 127 |
+
" \"\"\"\n",
|
| 128 |
+
" if self.isNone(name):\n",
|
| 129 |
+
" val = (None, False)\n",
|
| 130 |
+
" elif self.isDefault(name):\n",
|
| 131 |
+
" val = (self.handleDefault(name), True)\n",
|
| 132 |
+
" else:\n",
|
| 133 |
+
" bVal = self.configs[name].lower() == \"true\"\n",
|
| 134 |
+
" val = (bVal, False)\n",
|
| 135 |
+
" if self.verbose:\n",
|
| 136 |
+
" print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
|
| 137 |
+
" return val\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"\n",
|
| 140 |
+
" def getIntListConfig(self, name, delim=\",\"):\n",
|
| 141 |
+
" \"\"\"\n",
|
| 142 |
+
" get int list param\n",
|
| 143 |
+
" Parameters\n",
|
| 144 |
+
" name : config param name\n",
|
| 145 |
+
" delim : delemeter\n",
|
| 146 |
+
" \"\"\"\n",
|
| 147 |
+
" if self.isNone(name):\n",
|
| 148 |
+
" val = (None, False)\n",
|
| 149 |
+
" elif self.isDefault(name):\n",
|
| 150 |
+
" val = (self.handleDefault(name), True)\n",
|
| 151 |
+
" else:\n",
|
| 152 |
+
" delSepStr = self.getStringConfig(name)\n",
|
| 153 |
+
"\n",
|
| 154 |
+
" #specified as range\n",
|
| 155 |
+
" intList = strListOrRangeToIntArray(delSepStr[0])\n",
|
| 156 |
+
" val =(intList, delSepStr[1])\n",
|
| 157 |
+
" return val\n",
|
| 158 |
+
"\n",
|
| 159 |
+
" def getFloatListConfig(self, name, delim=\",\"):\n",
|
| 160 |
+
" \"\"\"\n",
|
| 161 |
+
" get float list param\n",
|
| 162 |
+
" Parameters\n",
|
| 163 |
+
" name : config param name\n",
|
| 164 |
+
" delim : delemeter\n",
|
| 165 |
+
" \"\"\"\n",
|
| 166 |
+
" delSepStr = self.getStringConfig(name)\n",
|
| 167 |
+
" if self.isNone(name):\n",
|
| 168 |
+
" val = (None, False)\n",
|
| 169 |
+
" elif self.isDefault(name):\n",
|
| 170 |
+
" val = (self.handleDefault(name), True)\n",
|
| 171 |
+
" else:\n",
|
| 172 |
+
" flList = strToFloatArray(delSepStr[0], delim)\n",
|
| 173 |
+
" val =(flList, delSepStr[1])\n",
|
| 174 |
+
" return val\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"\n",
|
| 177 |
+
" def getStringListConfig(self, name, delim=\",\"):\n",
|
| 178 |
+
" \"\"\"\n",
|
| 179 |
+
" get string list param\n",
|
| 180 |
+
" Parameters\n",
|
| 181 |
+
" name : config param name\n",
|
| 182 |
+
" delim : delemeter\n",
|
| 183 |
+
" \"\"\"\n",
|
| 184 |
+
" delSepStr = self.getStringConfig(name)\n",
|
| 185 |
+
" if self.isNone(name):\n",
|
| 186 |
+
" val = (None, False)\n",
|
| 187 |
+
" elif self.isDefault(name):\n",
|
| 188 |
+
" val = (self.handleDefault(name), True)\n",
|
| 189 |
+
" else:\n",
|
| 190 |
+
" strList = delSepStr[0].split(delim)\n",
|
| 191 |
+
" val = (strList, delSepStr[1])\n",
|
| 192 |
+
" return val\n",
|
| 193 |
+
"\n",
|
| 194 |
+
" def handleDefault(self, name):\n",
|
| 195 |
+
" \"\"\"\n",
|
| 196 |
+
" handles default\n",
|
| 197 |
+
" Parameters\n",
|
| 198 |
+
" name : config param name\n",
|
| 199 |
+
" \"\"\"\n",
|
| 200 |
+
" dVal = self.defValues[name]\n",
|
| 201 |
+
" if (dVal[1] is None):\n",
|
| 202 |
+
" val = dVal[0]\n",
|
| 203 |
+
" else:\n",
|
| 204 |
+
" raise ValueError(dVal[1])\n",
|
| 205 |
+
" return val\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" def isNone(self, name):\n",
|
| 209 |
+
" \"\"\"\n",
|
| 210 |
+
" true is value is None\t\n",
|
| 211 |
+
" Parameters\n",
|
| 212 |
+
" name : config param name\n",
|
| 213 |
+
" \"\"\"\n",
|
| 214 |
+
" return self.configs[name].lower() == \"none\"\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"\n",
|
| 217 |
+
" def isDefault(self, name):\n",
|
| 218 |
+
" \"\"\"\n",
|
| 219 |
+
" true if the value is default\t\n",
|
| 220 |
+
" Parameters\n",
|
| 221 |
+
" name : config param name\n",
|
| 222 |
+
" \"\"\"\n",
|
| 223 |
+
" de = self.configs[name] == \"_\"\n",
|
| 224 |
+
" #print de\n",
|
| 225 |
+
" return de\n",
|
| 226 |
+
"\n",
|
| 227 |
+
"\n",
|
| 228 |
+
" def eitherOrStringConfig(self, firstName, secondName):\n",
|
| 229 |
+
" \"\"\"\n",
|
| 230 |
+
" returns one of two string parameters\t\n",
|
| 231 |
+
" Parameters\n",
|
| 232 |
+
" firstName : first parameter name\n",
|
| 233 |
+
" secondName : second parameter name\t\n",
|
| 234 |
+
" \"\"\"\n",
|
| 235 |
+
" if not self.isNone(firstName):\n",
|
| 236 |
+
" first = self.getStringConfig(firstName)[0]\n",
|
| 237 |
+
" second = None\n",
|
| 238 |
+
" if not self.isNone(secondName):\n",
|
| 239 |
+
" raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
|
| 240 |
+
" else:\n",
|
| 241 |
+
" if not self.isNone(secondName):\n",
|
| 242 |
+
" second = self.getStringConfig(secondtName)[0]\n",
|
| 243 |
+
" first = None\n",
|
| 244 |
+
" else:\n",
|
| 245 |
+
" raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
|
| 246 |
+
" return (first, second)\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"\n",
|
| 249 |
+
" def eitherOrIntConfig(self, firstName, secondName):\n",
|
| 250 |
+
" \"\"\"\n",
|
| 251 |
+
" returns one of two int parameters\t\n",
|
| 252 |
+
" Parameters\n",
|
| 253 |
+
" firstName : first parameter name\n",
|
| 254 |
+
" secondName : second parameter name\t\n",
|
| 255 |
+
" \"\"\"\n",
|
| 256 |
+
" if not self.isNone(firstName):\n",
|
| 257 |
+
" first = self.getIntConfig(firstName)[0]\n",
|
| 258 |
+
" second = None\n",
|
| 259 |
+
" if not self.isNone(secondName):\n",
|
| 260 |
+
" raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
|
| 261 |
+
" else:\n",
|
| 262 |
+
" if not self.isNone(secondName):\n",
|
| 263 |
+
" second = self.getIntConfig(secondsName)[0]\n",
|
| 264 |
+
" first = None\n",
|
| 265 |
+
" else:\n",
|
| 266 |
+
" raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
|
| 267 |
+
" return (first, second)\n",
|
| 268 |
+
"\n",
|
| 269 |
+
"\n",
|
| 270 |
+
"class CatLabelGenerator:\n",
|
| 271 |
+
" \"\"\"\n",
|
| 272 |
+
" label generator for categorical variables\n",
|
| 273 |
+
" \"\"\"\n",
|
| 274 |
+
" def __init__(self, catValues, delim):\n",
|
| 275 |
+
" \"\"\"\n",
|
| 276 |
+
" initilizers\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" Parameters\n",
|
| 279 |
+
" catValues : dictionary of categorical values\n",
|
| 280 |
+
" delim : delemeter\n",
|
| 281 |
+
" \"\"\"\n",
|
| 282 |
+
" self.encoders = {}\n",
|
| 283 |
+
" self.catValues = catValues\n",
|
| 284 |
+
" self.delim = delim\n",
|
| 285 |
+
" for k in self.catValues.keys():\t\n",
|
| 286 |
+
" le = preprocessing.LabelEncoder()\t\n",
|
| 287 |
+
" le.fit(self.catValues[k])\n",
|
| 288 |
+
" self.encoders[k] = le\n",
|
| 289 |
+
"\n",
|
| 290 |
+
" def processRow(self, row):\t\n",
|
| 291 |
+
" \"\"\"\n",
|
| 292 |
+
" encode row categorical values\n",
|
| 293 |
+
"\n",
|
| 294 |
+
" Parameters:\n",
|
| 295 |
+
" row : data row\n",
|
| 296 |
+
" \"\"\"\n",
|
| 297 |
+
" #print row\n",
|
| 298 |
+
" rowArr = row.split(self.delim)\n",
|
| 299 |
+
" for i in range(len(rowArr)):\n",
|
| 300 |
+
" if (i in self.catValues):\n",
|
| 301 |
+
" curVal = rowArr[i]\n",
|
| 302 |
+
" assert curVal in self.catValues[i], \"categorival value invalid\"\n",
|
| 303 |
+
" encVal = self.encoders[i].transform([curVal])\n",
|
| 304 |
+
" rowArr[i] = str(encVal[0])\n",
|
| 305 |
+
" return self.delim.join(rowArr)\t\t\n",
|
| 306 |
+
"\n",
|
| 307 |
+
" def getOrigLabels(self, indx):\n",
|
| 308 |
+
" \"\"\"\n",
|
| 309 |
+
" get original labels\n",
|
| 310 |
+
"\n",
|
| 311 |
+
" Parameters:\n",
|
| 312 |
+
" indx : column index\n",
|
| 313 |
+
" \"\"\"\n",
|
| 314 |
+
" return self.encoders[indx].classes_\t\n",
|
| 315 |
+
"\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"class SupvLearningDataGenerator:\n",
|
| 318 |
+
" \"\"\"\n",
|
| 319 |
+
" data generator for supervised learning\n",
|
| 320 |
+
" \"\"\"\n",
|
| 321 |
+
" def __init__(self, configFile):\n",
|
| 322 |
+
" \"\"\"\n",
|
| 323 |
+
" initilizers\n",
|
| 324 |
+
"\n",
|
| 325 |
+
" Parameters\n",
|
| 326 |
+
" configFile : config file path\n",
|
| 327 |
+
" \"\"\"\n",
|
| 328 |
+
" defValues = dict()\n",
|
| 329 |
+
" defValues[\"common.num.samp\"] = (100, None)\n",
|
| 330 |
+
" defValues[\"common.num.feat\"] = (5, None)\n",
|
| 331 |
+
" defValues[\"common.feat.trans\"] = (None, None)\n",
|
| 332 |
+
" defValues[\"common.feat.types\"] = (None, \"missing feature types\")\n",
|
| 333 |
+
" defValues[\"common.cat.feat.distr\"] = (None, None)\n",
|
| 334 |
+
" defValues[\"common.output.precision\"] = (3, None)\n",
|
| 335 |
+
" defValues[\"common.error\"] = (0.01, None)\n",
|
| 336 |
+
" defValues[\"class.gen.technique\"] = (\"blob\", None)\n",
|
| 337 |
+
" defValues[\"class.num.feat.informative\"] = (2, None)\n",
|
| 338 |
+
" defValues[\"class.num.feat.redundant\"] = (2, None)\n",
|
| 339 |
+
" defValues[\"class.num.feat.repeated\"] = (0, None)\n",
|
| 340 |
+
" defValues[\"class.num.feat.cat\"] = (0, None)\n",
|
| 341 |
+
" defValues[\"class.num.class\"] = (2, None)\n",
|
| 342 |
+
"\n",
|
| 343 |
+
" self.config = Configuration(configFile, defValues)\n",
|
| 344 |
+
"\n",
|
| 345 |
+
" def genClassifierData(self):\n",
|
| 346 |
+
" \"\"\"\n",
|
| 347 |
+
" generates classifier data\n",
|
| 348 |
+
" \"\"\"\n",
|
| 349 |
+
" nsamp = self.config.getIntConfig(\"common.num.samp\")[0]\n",
|
| 350 |
+
" nfeat = self.config.getIntConfig(\"common.num.feat\")[0]\n",
|
| 351 |
+
" nclass = self.config.getIntConfig(\"class.num.class\")[0]\n",
|
| 352 |
+
" #transform with shift and scale\n",
|
| 353 |
+
" ftrans = self.config.getFloatListConfig(\"common.feat.trans\")[0]\n",
|
| 354 |
+
" feTrans = dict()\n",
|
| 355 |
+
" for i in range(0, len(ftrans), 2):\n",
|
| 356 |
+
" tr = (ftrans[i], ftrans[i+1])\n",
|
| 357 |
+
" indx = int(i/2)\n",
|
| 358 |
+
" feTrans[indx] = tr\n",
|
| 359 |
+
"\n",
|
| 360 |
+
" ftypes = self.config.getStringListConfig(\"common.feat.types\")[0]\n",
|
| 361 |
+
"\n",
|
| 362 |
+
" # categorical feature distribution\n",
|
| 363 |
+
" feCatDist = dict()\n",
|
| 364 |
+
" fcatdl = self.config.getStringListConfig(\"common.cat.feat.distr\")[0]\n",
|
| 365 |
+
" for fcatds in fcatdl:\n",
|
| 366 |
+
" fcatd = fcatds.split(\":\")\n",
|
| 367 |
+
" feInd = int(fcatd[0])\n",
|
| 368 |
+
" clVal = int(fcatd[1])\n",
|
| 369 |
+
" key = (feInd, clVal)\t\t#feature index and class value\n",
|
| 370 |
+
" dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))\n",
|
| 371 |
+
" feCatDist[key] = CategoricalRejectSampler(*dist)\n",
|
| 372 |
+
"\n",
|
| 373 |
+
" #shift and scale\n",
|
| 374 |
+
" genTechnique = self.config.getStringConfig(\"class.gen.technique\")[0]\n",
|
| 375 |
+
" error = self.config.getFloatConfig(\"common.error\")[0]\n",
|
| 376 |
+
" if genTechnique == \"blob\":\n",
|
| 377 |
+
" features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)\n",
|
| 378 |
+
" for i in range(nsamp):\t\t\t#shift and scale\n",
|
| 379 |
+
" for j in range(nfeat):\n",
|
| 380 |
+
" tr = feTrans[j]\n",
|
| 381 |
+
" features[i,j] = (features[i,j] + tr[0]) * tr[1]\n",
|
| 382 |
+
" claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))\n",
|
| 383 |
+
" elif genTechnique == \"classify\":\n",
|
| 384 |
+
" nfeatInfo = self.config.getIntConfig(\"class.num.feat.informative\")[0]\n",
|
| 385 |
+
" nfeatRed = self.config.getIntConfig(\"class.num.feat.redundant\")[0]\n",
|
| 386 |
+
" nfeatRep = self.config.getIntConfig(\"class.num.feat.repeated\")[0]\n",
|
| 387 |
+
" shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))\n",
|
| 388 |
+
" scales = list(map(lambda i : feTrans[i][1], range(nfeat)))\n",
|
| 389 |
+
" features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, \n",
|
| 390 |
+
" n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)\n",
|
| 391 |
+
" else:\n",
|
| 392 |
+
" raise \"invalid genaration technique\"\n",
|
| 393 |
+
"\n",
|
| 394 |
+
" # add categorical features and format\n",
|
| 395 |
+
" nCatFeat = self.config.getIntConfig(\"class.num.feat.cat\")[0]\n",
|
| 396 |
+
" prec = self.config.getIntConfig(\"common.output.precision\")[0]\n",
|
| 397 |
+
" for f , c in zip(features, claz):\n",
|
| 398 |
+
" nfs = list(map(lambda i : self.numFeToStr(i, f[i], c, ftypes[i], prec), range(nfeat)))\n",
|
| 399 |
+
" if nCatFeat > 0:\n",
|
| 400 |
+
" cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))\n",
|
| 401 |
+
" rec = \",\".join(nfs) + \",\" + \",\".join(cfs) + \",\" + str(c)\n",
|
| 402 |
+
" else:\n",
|
| 403 |
+
" rec = \",\".join(nfs) + \",\" + str(c)\n",
|
| 404 |
+
" yield rec\n",
|
| 405 |
+
"\n",
|
| 406 |
+
" def numFeToStr(self, fv, ft, prec):\n",
|
| 407 |
+
" \"\"\"\n",
|
| 408 |
+
" nummeric feature value to string\n",
|
| 409 |
+
"\n",
|
| 410 |
+
" Parameters\n",
|
| 411 |
+
" fv : field value\n",
|
| 412 |
+
" ft : field data type\n",
|
| 413 |
+
" prec : precision\n",
|
| 414 |
+
" \"\"\"\n",
|
| 415 |
+
" if ft == \"float\":\n",
|
| 416 |
+
" s = formatFloat(prec, fv)\n",
|
| 417 |
+
" elif ft ==\"int\":\n",
|
| 418 |
+
" s = str(int(fv))\n",
|
| 419 |
+
" else:\t\t\n",
|
| 420 |
+
" raise \"invalid type expecting float or int\"\n",
|
| 421 |
+
" return s\n",
|
| 422 |
+
"\n",
|
| 423 |
+
" def catFe(self, i, cv, ft, feCatDist):\n",
|
| 424 |
+
" \"\"\"\n",
|
| 425 |
+
" generate categorical feature\n",
|
| 426 |
+
"\n",
|
| 427 |
+
" Parameters\n",
|
| 428 |
+
" i : col index\n",
|
| 429 |
+
" cv : class value\n",
|
| 430 |
+
" ft : field data type\n",
|
| 431 |
+
" feCatDist : cat value distribution\n",
|
| 432 |
+
" \"\"\"\n",
|
| 433 |
+
" if ft == \"cat\":\n",
|
| 434 |
+
" key = (i, cv)\n",
|
| 435 |
+
" s = feCatDist[key].sample()\n",
|
| 436 |
+
" else:\t\t\n",
|
| 437 |
+
" raise \"invalid type expecting categorical\"\n",
|
| 438 |
+
" return s\n",
|
| 439 |
+
"\n",
|
| 440 |
+
"\n",
|
| 441 |
+
"\n",
|
| 442 |
+
"def loadDataFile(file, delim, cols, colIndices):\n",
|
| 443 |
+
" \"\"\"\n",
|
| 444 |
+
" loads delim separated file and extracts columns\n",
|
| 445 |
+
" Parameters\n",
|
| 446 |
+
" file : file path\n",
|
| 447 |
+
" delim : delemeter\n",
|
| 448 |
+
" cols : columns to use from file\n",
|
| 449 |
+
" colIndices ; columns to extract\n",
|
| 450 |
+
" \"\"\"\n",
|
| 451 |
+
" data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
|
| 452 |
+
" extrData = data[:,colIndices]\n",
|
| 453 |
+
" return (data, extrData)\n",
|
| 454 |
+
"\n",
|
| 455 |
+
"def loadFeatDataFile(file, delim, cols):\n",
|
| 456 |
+
" \"\"\"\n",
|
| 457 |
+
" loads delim separated file and extracts columns\n",
|
| 458 |
+
"\n",
|
| 459 |
+
" Parameters\n",
|
| 460 |
+
" file : file path\n",
|
| 461 |
+
" delim : delemeter\n",
|
| 462 |
+
" cols : columns to use from file\n",
|
| 463 |
+
" \"\"\"\n",
|
| 464 |
+
" data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
|
| 465 |
+
" return data\n",
|
| 466 |
+
"\n",
|
| 467 |
+
"def extrColumns(arr, columns):\n",
|
| 468 |
+
" \"\"\"\n",
|
| 469 |
+
" extracts columns\n",
|
| 470 |
+
"\n",
|
| 471 |
+
" Parameters\n",
|
| 472 |
+
" arr : 2D array\n",
|
| 473 |
+
" columns : columns\n",
|
| 474 |
+
" \"\"\"\n",
|
| 475 |
+
" return arr[:, columns]\n",
|
| 476 |
+
"\n",
|
| 477 |
+
"def subSample(featData, clsData, subSampleRate, withReplacement):\n",
|
| 478 |
+
" \"\"\"\n",
|
| 479 |
+
" subsample feature and class label data\t\n",
|
| 480 |
+
" Parameters\n",
|
| 481 |
+
" featData : 2D array of feature data\n",
|
| 482 |
+
" clsData : arrray of class labels\n",
|
| 483 |
+
" subSampleRate : fraction to be sampled\n",
|
| 484 |
+
" withReplacement : true if sampling with replacement\n",
|
| 485 |
+
" \"\"\"\n",
|
| 486 |
+
" sampSize = int(featData.shape[0] * subSampleRate)\n",
|
| 487 |
+
" sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement)\n",
|
| 488 |
+
" sampFeat = featData[sampledIndx]\n",
|
| 489 |
+
" sampCls = clsData[sampledIndx]\n",
|
| 490 |
+
" return(sampFeat, sampCls)\n",
|
| 491 |
+
"\n",
|
| 492 |
+
"def euclideanDistance(x,y):\n",
|
| 493 |
+
" \"\"\"\n",
|
| 494 |
+
" euclidean distance\n",
|
| 495 |
+
" Parameters\n",
|
| 496 |
+
" x : first vector\n",
|
| 497 |
+
" y : second fvector\n",
|
| 498 |
+
" \"\"\"\n",
|
| 499 |
+
" return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"def squareRooted(x):\n",
|
| 502 |
+
" \"\"\"\n",
|
| 503 |
+
" square root of sum square\n",
|
| 504 |
+
" Parameters\n",
|
| 505 |
+
" x : data vector\n",
|
| 506 |
+
" \"\"\"\n",
|
| 507 |
+
" return round(sqrt(sum([a*a for a in x])),3)\n",
|
| 508 |
+
"\n",
|
| 509 |
+
"def cosineSimilarity(x,y):\n",
|
| 510 |
+
" \"\"\"\n",
|
| 511 |
+
" cosine similarity\n",
|
| 512 |
+
"\n",
|
| 513 |
+
" Parameters\n",
|
| 514 |
+
" x : first vector\n",
|
| 515 |
+
" y : second fvector\n",
|
| 516 |
+
" \"\"\"\n",
|
| 517 |
+
" numerator = sum(a*b for a,b in zip(x,y))\n",
|
| 518 |
+
" denominator = squareRooted(x) * squareRooted(y)\n",
|
| 519 |
+
" return round(numerator / float(denominator), 3)\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"def cosineDistance(x,y):\n",
|
| 522 |
+
" \"\"\"\n",
|
| 523 |
+
" cosine distance\n",
|
| 524 |
+
" Parameters\n",
|
| 525 |
+
" x : first vector\n",
|
| 526 |
+
" y : second fvector\n",
|
| 527 |
+
" \"\"\"\n",
|
| 528 |
+
" return 1.0 - cosineSimilarity(x,y)\n",
|
| 529 |
+
"\n",
|
| 530 |
+
"def manhattanDistance(x,y):\n",
|
| 531 |
+
" \"\"\"\n",
|
| 532 |
+
" manhattan distance\n",
|
| 533 |
+
" Parameters\n",
|
| 534 |
+
" x : first vector\n",
|
| 535 |
+
" y : second fvector\n",
|
| 536 |
+
" \"\"\"\n",
|
| 537 |
+
" return sum(abs(a-b) for a,b in zip(x,y))\n",
|
| 538 |
+
"\n",
|
| 539 |
+
"def nthRoot(value, nRoot):\n",
|
| 540 |
+
" \"\"\"\n",
|
| 541 |
+
" nth root\n",
|
| 542 |
+
" Parameters\n",
|
| 543 |
+
" value : data value\n",
|
| 544 |
+
" nRoot : root\n",
|
| 545 |
+
" \"\"\"\n",
|
| 546 |
+
" rootValue = 1/float(nRoot)\n",
|
| 547 |
+
" return round (Decimal(value) ** Decimal(rootValue),3)\n",
|
| 548 |
+
"\n",
|
| 549 |
+
"def minkowskiDistance(x,y,pValue):\n",
|
| 550 |
+
" \"\"\"\n",
|
| 551 |
+
" minkowski distance\n",
|
| 552 |
+
" Parameters\n",
|
| 553 |
+
" x : first vector\n",
|
| 554 |
+
" y : second fvector\n",
|
| 555 |
+
" pValue : power factor\n",
|
| 556 |
+
" \"\"\"\n",
|
| 557 |
+
" return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)\n",
|
| 558 |
+
"\n",
|
| 559 |
+
"def jaccardSimilarityX(x,y):\n",
|
| 560 |
+
" \"\"\"\n",
|
| 561 |
+
" jaccard similarity\n",
|
| 562 |
+
" Parameters\n",
|
| 563 |
+
" x : first vector\n",
|
| 564 |
+
" y : second fvector\n",
|
| 565 |
+
" \"\"\"\n",
|
| 566 |
+
" intersectionCardinality = len(set.intersection(*[set(x), set(y)]))\n",
|
| 567 |
+
" unionCardinality = len(set.union(*[set(x), set(y)]))\n",
|
| 568 |
+
" return intersectionCardinality/float(unionCardinality)\n",
|
| 569 |
+
"\n",
|
| 570 |
+
"def jaccardSimilarity(x,y,wx=1.0,wy=1.0):\n",
|
| 571 |
+
" \"\"\"\n",
|
| 572 |
+
" jaccard similarity\n",
|
| 573 |
+
"\n",
|
| 574 |
+
" Parameters\n",
|
| 575 |
+
" x : first vector\n",
|
| 576 |
+
" y : second fvector\n",
|
| 577 |
+
" wx : weight for x\n",
|
| 578 |
+
" wy : weight for y\n",
|
| 579 |
+
" \"\"\"\n",
|
| 580 |
+
" sx = set(x)\n",
|
| 581 |
+
" sy = set(y)\n",
|
| 582 |
+
" sxyInt = sx.intersection(sy)\n",
|
| 583 |
+
" intCardinality = len(sxyInt)\n",
|
| 584 |
+
" sxIntDiff = sx.difference(sxyInt)\n",
|
| 585 |
+
" syIntDiff = sy.difference(sxyInt)\n",
|
| 586 |
+
" unionCardinality = len(sx.union(sy))\n",
|
| 587 |
+
" return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))\n",
|
| 588 |
+
"\n",
|
| 589 |
+
"def levenshteinSimilarity(s1, s2):\n",
|
| 590 |
+
" \"\"\"\n",
|
| 591 |
+
" Levenshtein similarity for strings\n",
|
| 592 |
+
"\n",
|
| 593 |
+
" Parameters\n",
|
| 594 |
+
" sx : first string\n",
|
| 595 |
+
" sy : second string\n",
|
| 596 |
+
" \"\"\"\n",
|
| 597 |
+
" assert type(s1) == str and type(s2) == str, \"Levenshtein similarity is for string only\"\n",
|
| 598 |
+
" d = ld(s1,s2)\n",
|
| 599 |
+
" #print(d)\n",
|
| 600 |
+
" l = max(len(s1),len(s2))\n",
|
| 601 |
+
" d = 1.0 - min(d/l, 1.0)\n",
|
| 602 |
+
" return d\t\n",
|
| 603 |
+
"\n",
|
| 604 |
+
"def norm(values, po=2):\n",
|
| 605 |
+
" \"\"\"\n",
|
| 606 |
+
" norm\n",
|
| 607 |
+
" Parameters\n",
|
| 608 |
+
" values : list of values\n",
|
| 609 |
+
" po : power\n",
|
| 610 |
+
" \"\"\"\n",
|
| 611 |
+
" no = sum(list(map(lambda v: pow(v,po), values)))\n",
|
| 612 |
+
" no = pow(no,1.0/po)\n",
|
| 613 |
+
" return list(map(lambda v: v/no, values))\n",
|
| 614 |
+
"\n",
|
| 615 |
+
"def createOneHotVec(size, indx = -1):\n",
|
| 616 |
+
" \"\"\"\n",
|
| 617 |
+
" random one hot vector\n",
|
| 618 |
+
"\n",
|
| 619 |
+
" Parameters\n",
|
| 620 |
+
" size : vector size\n",
|
| 621 |
+
" indx : one hot position\n",
|
| 622 |
+
" \"\"\"\n",
|
| 623 |
+
" vec = [0] * size\n",
|
| 624 |
+
" s = random.randint(0, size - 1) if indx < 0 else indx\n",
|
| 625 |
+
" vec[s] = 1\n",
|
| 626 |
+
" return vec\n",
|
| 627 |
+
"\n",
|
| 628 |
+
"def createAllOneHotVec(size):\n",
|
| 629 |
+
" \"\"\"\n",
|
| 630 |
+
" create all one hot vectors\n",
|
| 631 |
+
"\n",
|
| 632 |
+
" Parameters\n",
|
| 633 |
+
" size : vector size and no of vectors\n",
|
| 634 |
+
" \"\"\"\n",
|
| 635 |
+
" vecs = list()\n",
|
| 636 |
+
" for i in range(size):\n",
|
| 637 |
+
" vec = [0] * size\n",
|
| 638 |
+
" vec[i] = 1\n",
|
| 639 |
+
" vecs.append(vec)\n",
|
| 640 |
+
" return vecs\n",
|
| 641 |
+
"\n",
|
| 642 |
+
"def blockShuffle(data, blockSize):\n",
|
| 643 |
+
" \"\"\"\n",
|
| 644 |
+
" block shuffle \t\n",
|
| 645 |
+
"\n",
|
| 646 |
+
" Parameters\n",
|
| 647 |
+
" data : list data\n",
|
| 648 |
+
" blockSize : block size\n",
|
| 649 |
+
" \"\"\"\n",
|
| 650 |
+
" numBlock = int(len(data) / blockSize)\n",
|
| 651 |
+
" remain = len(data) % blockSize\n",
|
| 652 |
+
" numBlock += (1 if remain > 0 else 0)\n",
|
| 653 |
+
" shuffled = list()\n",
|
| 654 |
+
" for i in range(numBlock):\n",
|
| 655 |
+
" b = random.randint(0, numBlock-1)\n",
|
| 656 |
+
" beg = b * blockSize\n",
|
| 657 |
+
" if (b < numBlock-1):\n",
|
| 658 |
+
" end = beg + blockSize\n",
|
| 659 |
+
" shuffled.extend(data[beg:end])\t\t\n",
|
| 660 |
+
" else:\n",
|
| 661 |
+
" shuffled.extend(data[beg:])\n",
|
| 662 |
+
" return shuffled\t\n",
|
| 663 |
+
"\n",
|
| 664 |
+
"def shuffle(data, numShuffle):\n",
|
| 665 |
+
" \"\"\"\n",
|
| 666 |
+
" shuffle data by randonm swapping\n",
|
| 667 |
+
"\n",
|
| 668 |
+
" Parameters\n",
|
| 669 |
+
" data : list data\n",
|
| 670 |
+
" numShuffle : no of pairwise swaps\n",
|
| 671 |
+
" \"\"\"\n",
|
| 672 |
+
" sz = len(data)\n",
|
| 673 |
+
" if numShuffle is None:\n",
|
| 674 |
+
" numShuffle = int(sz / 2)\n",
|
| 675 |
+
" for i in range(numShuffle):\n",
|
| 676 |
+
" fi = random.randint(0, sz -1)\n",
|
| 677 |
+
" se = random.randint(0, sz -1)\n",
|
| 678 |
+
" tmp = data[fi]\n",
|
| 679 |
+
" data[fi] = data[se]\n",
|
| 680 |
+
" data[se] = tmp\t\n",
|
| 681 |
+
"\n",
|
| 682 |
+
"def randomWalk(size, start, lowStep, highStep):\n",
|
| 683 |
+
" \"\"\"\n",
|
| 684 |
+
" random walk\t\n",
|
| 685 |
+
"\n",
|
| 686 |
+
" Parameters\n",
|
| 687 |
+
" size : list data\n",
|
| 688 |
+
" start : initial position\n",
|
| 689 |
+
" lowStep : step min\n",
|
| 690 |
+
" highStep : step max\n",
|
| 691 |
+
" \"\"\"\n",
|
| 692 |
+
" cur = start\n",
|
| 693 |
+
" for i in range(size):\n",
|
| 694 |
+
" yield cur\n",
|
| 695 |
+
" cur += randomFloat(lowStep, highStep)\n",
|
| 696 |
+
"\n",
|
| 697 |
+
"def binaryEcodeCategorical(values, value):\n",
|
| 698 |
+
" \"\"\"\n",
|
| 699 |
+
" one hot binary encoding\t\n",
|
| 700 |
+
"\n",
|
| 701 |
+
" Parameters\n",
|
| 702 |
+
" values : list of values\n",
|
| 703 |
+
" value : value to be replaced with 1\n",
|
| 704 |
+
" \"\"\"\n",
|
| 705 |
+
" size = len(values)\n",
|
| 706 |
+
" vec = [0] * size\n",
|
| 707 |
+
" for i in range(size):\n",
|
| 708 |
+
" if (values[i] == value):\n",
|
| 709 |
+
" vec[i] = 1\n",
|
| 710 |
+
" return vec\t\t\n",
|
| 711 |
+
"\n",
|
| 712 |
+
"def createLabeledSeq(inputData, tw):\n",
|
| 713 |
+
" \"\"\"\n",
|
| 714 |
+
" Creates feature, label pair from sequence data, where we have tw number of features followed by output\n",
|
| 715 |
+
"\n",
|
| 716 |
+
" Parameters\n",
|
| 717 |
+
" values : list containing feature and label\n",
|
| 718 |
+
" tw : no of features\n",
|
| 719 |
+
" \"\"\"\n",
|
| 720 |
+
" features = list()\n",
|
| 721 |
+
" labels = list()\n",
|
| 722 |
+
" l = len(inputDta)\n",
|
| 723 |
+
" for i in range(l - tw):\n",
|
| 724 |
+
" trainSeq = inputData[i:i+tw]\n",
|
| 725 |
+
" trainLabel = inputData[i+tw]\n",
|
| 726 |
+
" features.append(trainSeq)\n",
|
| 727 |
+
" labels.append(trainLabel)\n",
|
| 728 |
+
" return (features, labels)\n",
|
| 729 |
+
"\n",
|
| 730 |
+
"def createLabeledSeq(filePath, delim, index, tw):\n",
|
| 731 |
+
" \"\"\"\n",
|
| 732 |
+
" Creates feature, label pair from 1D sequence data in file\t\n",
|
| 733 |
+
"\n",
|
| 734 |
+
" Parameters\n",
|
| 735 |
+
" filePath : file path\n",
|
| 736 |
+
" delim : delemeter\n",
|
| 737 |
+
" index : column index\n",
|
| 738 |
+
" tw : no of features\n",
|
| 739 |
+
" \"\"\"\n",
|
| 740 |
+
" seqData = getFileColumnAsFloat(filePath, delim, index)\n",
|
| 741 |
+
" return createLabeledSeq(seqData, tw)\n",
|
| 742 |
+
"\n",
|
| 743 |
+
"def fromMultDimSeqToTabular(data, inpSize, seqLen):\n",
|
| 744 |
+
" \"\"\"\n",
|
| 745 |
+
" Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)\n",
|
| 746 |
+
"\n",
|
| 747 |
+
" Parameters\n",
|
| 748 |
+
" data : 2D array\n",
|
| 749 |
+
" inpSize : each input size in sequence\n",
|
| 750 |
+
" seqLen : sequence length\n",
|
| 751 |
+
" \"\"\"\t\n",
|
| 752 |
+
" nrow = data.shape[0]\n",
|
| 753 |
+
" assert data.shape[1] == inpSize * seqLen, \"invalid input size or sequence length\"\n",
|
| 754 |
+
" return data.reshape(nrow * seqLen, inpSize)\n",
|
| 755 |
+
"\n",
|
| 756 |
+
"def fromTabularToMultDimSeq(data, inpSize, seqLen):\n",
|
| 757 |
+
" \"\"\"\n",
|
| 758 |
+
" Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) \n",
|
| 759 |
+
" Parameters\n",
|
| 760 |
+
" data : 2D array\n",
|
| 761 |
+
" inpSize : each input size in sequence\n",
|
| 762 |
+
" seqLen : sequence length\n",
|
| 763 |
+
" \"\"\"\t\n",
|
| 764 |
+
" nrow = int(data.shape[0] / seqLen)\n",
|
| 765 |
+
" assert data.shape[1] == inpSize, \"invalid input size\"\n",
|
| 766 |
+
" return data.reshape(nrow, seqLen * inpSize)\n",
|
| 767 |
+
"\n",
|
| 768 |
+
"def difference(data, interval=1):\n",
|
| 769 |
+
" \"\"\"\n",
|
| 770 |
+
" takes difference in time series data\n",
|
| 771 |
+
" Parameters\n",
|
| 772 |
+
" data :list data\n",
|
| 773 |
+
" interval : interval for difference\n",
|
| 774 |
+
" \"\"\"\n",
|
| 775 |
+
" diff = list()\n",
|
| 776 |
+
" for i in range(interval, len(data)):\n",
|
| 777 |
+
" value = data[i] - data[i - interval]\n",
|
| 778 |
+
" diff.append(value)\n",
|
| 779 |
+
" return diff\n",
|
| 780 |
+
"\n",
|
| 781 |
+
"def normalizeMatrix(data, norm, axis=1):\n",
|
| 782 |
+
" \"\"\"\n",
|
| 783 |
+
" normalized each row of the matrix\n",
|
| 784 |
+
"\n",
|
| 785 |
+
" Parameters\n",
|
| 786 |
+
" data : 2D data\n",
|
| 787 |
+
" nporm : normalization method\n",
|
| 788 |
+
" axis : row or column\n",
|
| 789 |
+
" \"\"\"\n",
|
| 790 |
+
" normalized = preprocessing.normalize(data,norm=norm, axis=axis)\n",
|
| 791 |
+
" return normalized\n",
|
| 792 |
+
"\n",
|
| 793 |
+
"def standardizeMatrix(data, axis=0):\n",
|
| 794 |
+
" \"\"\"\n",
|
| 795 |
+
" standardizes each column of the matrix with mean and std deviation\n",
|
| 796 |
+
" Parameters\n",
|
| 797 |
+
" data : 2D data\n",
|
| 798 |
+
" axis : row or column\n",
|
| 799 |
+
" \"\"\"\n",
|
| 800 |
+
" standardized = preprocessing.scale(data, axis=axis)\n",
|
| 801 |
+
" return standardized\n",
|
| 802 |
+
"\n",
|
| 803 |
+
"def asNumpyArray(data):\n",
|
| 804 |
+
" \"\"\"\n",
|
| 805 |
+
" converts to numpy array\n",
|
| 806 |
+
" Parameters\n",
|
| 807 |
+
" data : array\n",
|
| 808 |
+
" \"\"\"\n",
|
| 809 |
+
" return np.array(data)\n",
|
| 810 |
+
"\n",
|
| 811 |
+
"def perfMetric(metric, yActual, yPred, clabels=None):\n",
|
| 812 |
+
" \"\"\"\n",
|
| 813 |
+
" predictive model accuracy metric\n",
|
| 814 |
+
" Parameters\n",
|
| 815 |
+
" metric : accuracy metric\n",
|
| 816 |
+
" yActual : actual values array\n",
|
| 817 |
+
" yPred : predicted values array\n",
|
| 818 |
+
" clabels : class labels\n",
|
| 819 |
+
" \"\"\"\n",
|
| 820 |
+
" if metric == \"rsquare\":\n",
|
| 821 |
+
" score = metrics.r2_score(yActual, yPred)\n",
|
| 822 |
+
" elif metric == \"mae\":\n",
|
| 823 |
+
" score = metrics.mean_absolute_error(yActual, yPred)\n",
|
| 824 |
+
" elif metric == \"mse\":\n",
|
| 825 |
+
" score = metrics.mean_squared_error(yActual, yPred)\n",
|
| 826 |
+
" elif metric == \"acc\":\n",
|
| 827 |
+
" yPred = np.rint(yPred)\n",
|
| 828 |
+
" score = metrics.accuracy_score(yActual, yPred)\n",
|
| 829 |
+
" elif metric == \"mlAcc\":\n",
|
| 830 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 831 |
+
" score = metrics.accuracy_score(yActual, yPred)\n",
|
| 832 |
+
" elif metric == \"prec\":\n",
|
| 833 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 834 |
+
" score = metrics.precision_score(yActual, yPred)\n",
|
| 835 |
+
" elif metric == \"rec\":\n",
|
| 836 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 837 |
+
" score = metrics.recall_score(yActual, yPred)\n",
|
| 838 |
+
" elif metric == \"fone\":\n",
|
| 839 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 840 |
+
" score = metrics.f1_score(yActual, yPred)\n",
|
| 841 |
+
" elif metric == \"confm\":\n",
|
| 842 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 843 |
+
" score = metrics.confusion_matrix(yActual, yPred)\n",
|
| 844 |
+
" elif metric == \"clarep\":\n",
|
| 845 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 846 |
+
" score = metrics.classification_report(yActual, yPred)\n",
|
| 847 |
+
" elif metric == \"bce\":\n",
|
| 848 |
+
" if clabels is None:\n",
|
| 849 |
+
" clabels = [0, 1]\n",
|
| 850 |
+
" score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
|
| 851 |
+
" elif metric == \"ce\":\n",
|
| 852 |
+
" assert clabels is not None, \"labels must be provided\"\n",
|
| 853 |
+
" score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
|
| 854 |
+
" else:\n",
|
| 855 |
+
" exitWithMsg(\"invalid prediction performance metric \" + metric)\n",
|
| 856 |
+
" return score\n",
|
| 857 |
+
"\n",
|
| 858 |
+
"def scaleData(data, method):\n",
|
| 859 |
+
" \"\"\"\n",
|
| 860 |
+
" scales feature data column wise\n",
|
| 861 |
+
" Parameters\n",
|
| 862 |
+
" data : 2D array\n",
|
| 863 |
+
" method : scaling method\n",
|
| 864 |
+
" \"\"\"\n",
|
| 865 |
+
" if method == \"minmax\":\n",
|
| 866 |
+
" scaler = preprocessing.MinMaxScaler()\n",
|
| 867 |
+
" data = scaler.fit_transform(data)\n",
|
| 868 |
+
" elif method == \"zscale\":\n",
|
| 869 |
+
" data = preprocessing.scale(data)\t\n",
|
| 870 |
+
" else:\n",
|
| 871 |
+
" raise ValueError(\"invalid scaling method\")\t\n",
|
| 872 |
+
" return data\n",
|
| 873 |
+
"\n",
|
| 874 |
+
"def scaleDataWithParams(data, method, scParams):\n",
|
| 875 |
+
" \"\"\"\n",
|
| 876 |
+
" scales feature data column wise\n",
|
| 877 |
+
" Parameters\n",
|
| 878 |
+
" data : 2D array\n",
|
| 879 |
+
" method : scaling method\n",
|
| 880 |
+
" scParams : scaling parameters\n",
|
| 881 |
+
" \"\"\"\n",
|
| 882 |
+
" if method == \"minmax\":\n",
|
| 883 |
+
" data = scaleMinMaxTabData(data, scParams)\n",
|
| 884 |
+
" elif method == \"zscale\":\n",
|
| 885 |
+
" raise ValueError(\"invalid scaling method\")\t\n",
|
| 886 |
+
" else:\n",
|
| 887 |
+
" raise ValueError(\"invalid scaling method\")\t\n",
|
| 888 |
+
" return data\n",
|
| 889 |
+
"\n",
|
| 890 |
+
"\n",
|
| 891 |
+
"def scaleMinMaxTabData(tdata, minMax):\n",
|
| 892 |
+
" \"\"\"\n",
|
| 893 |
+
" for tabular scales feature data column wise using min max values for each field\n",
|
| 894 |
+
" Parameters\n",
|
| 895 |
+
" tdata : 2D array\n",
|
| 896 |
+
" minMax : ni, max and range for each column\n",
|
| 897 |
+
" \"\"\"\n",
|
| 898 |
+
" stdata = list()\n",
|
| 899 |
+
" for r in tdata:\n",
|
| 900 |
+
" srdata = list()\n",
|
| 901 |
+
" for i, c in enumerate(r):\n",
|
| 902 |
+
" sd = (c - minMax[i][0]) / minMax[i][2]\n",
|
| 903 |
+
" srdata.append(sd)\n",
|
| 904 |
+
" stdata.append(srdata)\n",
|
| 905 |
+
" return stdata\n",
|
| 906 |
+
"\n",
|
| 907 |
+
"def scaleMinMax(rdata, minMax):\n",
|
| 908 |
+
" \"\"\"\n",
|
| 909 |
+
" scales feature data column wise using min max values for each field\n",
|
| 910 |
+
" Parameters\n",
|
| 911 |
+
" rdata : data array\n",
|
| 912 |
+
" minMax : ni, max and range for each column\n",
|
| 913 |
+
" \"\"\"\n",
|
| 914 |
+
" srdata = list()\n",
|
| 915 |
+
" for i in range(len(rdata)):\n",
|
| 916 |
+
" d = rdata[i]\n",
|
| 917 |
+
" sd = (d - minMax[i][0]) / minMax[i][2]\n",
|
| 918 |
+
" srdata.append(sd)\n",
|
| 919 |
+
" return srdata\n",
|
| 920 |
+
"\n",
|
| 921 |
+
"def harmonicNum(n):\n",
|
| 922 |
+
" \"\"\"\n",
|
| 923 |
+
" harmonic number\n",
|
| 924 |
+
" Parameters\n",
|
| 925 |
+
" n : number\n",
|
| 926 |
+
" \"\"\"\n",
|
| 927 |
+
" h = 0\n",
|
| 928 |
+
" for i in range(1, n+1, 1):\n",
|
| 929 |
+
" h += 1.0 / i\n",
|
| 930 |
+
" return h\n",
|
| 931 |
+
"\n",
|
| 932 |
+
"def digammaFun(n):\n",
|
| 933 |
+
" \"\"\"\n",
|
| 934 |
+
" figamma function\n",
|
| 935 |
+
" Parameters\n",
|
| 936 |
+
" n : number\n",
|
| 937 |
+
" \"\"\"\n",
|
| 938 |
+
" #Euler Mascheroni constant\n",
|
| 939 |
+
" ec = 0.577216\n",
|
| 940 |
+
" return harmonicNum(n - 1) - ec\n",
|
| 941 |
+
"\n",
|
| 942 |
+
"def getDataPartitions(tdata, types, columns = None):\n",
|
| 943 |
+
" \"\"\"\n",
|
| 944 |
+
" partitions data with the given columns and random split point defined with predicates\n",
|
| 945 |
+
" Parameters\n",
|
| 946 |
+
" tdata : 2D array\n",
|
| 947 |
+
" types : data typers\n",
|
| 948 |
+
" columns : column indexes\n",
|
| 949 |
+
" \"\"\"\n",
|
| 950 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\n",
|
| 951 |
+
" if columns is None:\n",
|
| 952 |
+
" ncol = len(data[0])\n",
|
| 953 |
+
" columns = list(range(ncol))\n",
|
| 954 |
+
" ncol = len(columns)\n",
|
| 955 |
+
" #print(columns)\n",
|
| 956 |
+
"\n",
|
| 957 |
+
" # partition predicates\n",
|
| 958 |
+
" partitions = None\n",
|
| 959 |
+
" for c in columns:\n",
|
| 960 |
+
" #print(c)\n",
|
| 961 |
+
" dtype = dtypes[c]\n",
|
| 962 |
+
" pred = list()\n",
|
| 963 |
+
" if dtype == \"int\" or dtype == \"float\":\n",
|
| 964 |
+
" (vmin, vmax) = getColMinMax(tdata, c)\n",
|
| 965 |
+
" r = vmax - vmin\n",
|
| 966 |
+
" rmin = vmin + .2 * r\n",
|
| 967 |
+
" rmax = vmax - .2 * r\n",
|
| 968 |
+
" sp = randomFloat(rmin, rmax)\n",
|
| 969 |
+
" if dtype == \"int\":\n",
|
| 970 |
+
" sp = int(sp)\n",
|
| 971 |
+
" else:\n",
|
| 972 |
+
" sp = \"{:.3f}\".format(sp)\n",
|
| 973 |
+
" sp = float(sp)\n",
|
| 974 |
+
" pred.append([c, \"LT\", sp])\n",
|
| 975 |
+
" pred.append([c, \"GE\", sp])\n",
|
| 976 |
+
" elif dtype == \"cat\":\n",
|
| 977 |
+
" cv = cvalues[c]\n",
|
| 978 |
+
" card = len(cv) \n",
|
| 979 |
+
" if card < 3:\n",
|
| 980 |
+
" num = 1\n",
|
| 981 |
+
" else:\n",
|
| 982 |
+
" num = randomInt(1, card - 1)\n",
|
| 983 |
+
" sp = selectRandomSubListFromList(cv, num)\n",
|
| 984 |
+
" sp = \" \".join(sp)\n",
|
| 985 |
+
" pred.append([c, \"IN\", sp])\n",
|
| 986 |
+
" pred.append([c, \"NOTIN\", sp])\n",
|
| 987 |
+
"\n",
|
| 988 |
+
" #print(pred)\n",
|
| 989 |
+
" if partitions is None:\n",
|
| 990 |
+
" partitions = pred.copy()\n",
|
| 991 |
+
" #print(\"initial\")\n",
|
| 992 |
+
" #print(partitions)\n",
|
| 993 |
+
" else:\n",
|
| 994 |
+
" #print(\"extension\")\n",
|
| 995 |
+
" tparts = list()\n",
|
| 996 |
+
" for p in partitions:\n",
|
| 997 |
+
" #print(p)\n",
|
| 998 |
+
" l1 = p.copy()\n",
|
| 999 |
+
" l1.extend(pred[0])\n",
|
| 1000 |
+
" l2 = p.copy()\n",
|
| 1001 |
+
" l2.extend(pred[1])\n",
|
| 1002 |
+
" #print(\"after extension\")\n",
|
| 1003 |
+
" #print(l1)\n",
|
| 1004 |
+
" #print(l2)\n",
|
| 1005 |
+
" tparts.append(l1)\n",
|
| 1006 |
+
" tparts.append(l2)\n",
|
| 1007 |
+
" partitions = tparts\t\n",
|
| 1008 |
+
" #print(\"extending\")\n",
|
| 1009 |
+
" #print(partitions)\n",
|
| 1010 |
+
"\n",
|
| 1011 |
+
" #for p in partitions:\n",
|
| 1012 |
+
" #print(p)\t\n",
|
| 1013 |
+
" return partitions\t\t\t\n",
|
| 1014 |
+
"\n",
|
| 1015 |
+
"def genAlmostUniformDistr(size, nswap=50):\n",
|
| 1016 |
+
" \"\"\"\n",
|
| 1017 |
+
" generate probability distribution\n",
|
| 1018 |
+
"\n",
|
| 1019 |
+
" Parameters\n",
|
| 1020 |
+
" size : distr size\n",
|
| 1021 |
+
" nswap : no of mass swaps\n",
|
| 1022 |
+
" \"\"\"\n",
|
| 1023 |
+
" un = 1.0 / size\n",
|
| 1024 |
+
" distr = [un] * size\n",
|
| 1025 |
+
" distr = mutDistr(distr, 0.1 * un, nswap)\n",
|
| 1026 |
+
" return distr\n",
|
| 1027 |
+
"\n",
|
| 1028 |
+
"def mutDistr(distr, shift, nswap=50):\n",
|
| 1029 |
+
" \"\"\"\n",
|
| 1030 |
+
" mutates a probability distribution\n",
|
| 1031 |
+
"\n",
|
| 1032 |
+
" Parameters\n",
|
| 1033 |
+
" distr distribution\n",
|
| 1034 |
+
" shift : amount of shift for swap\n",
|
| 1035 |
+
" nswap : no of mass swaps\n",
|
| 1036 |
+
" \"\"\"\n",
|
| 1037 |
+
" size = len(distr)\n",
|
| 1038 |
+
" for _ in range(nswap):\n",
|
| 1039 |
+
" fi = randomInt(0, size -1)\n",
|
| 1040 |
+
" si = randomInt(0, size -1)\n",
|
| 1041 |
+
" while fi == si:\n",
|
| 1042 |
+
" fi = randomInt(0, size -1)\n",
|
| 1043 |
+
" si = randomInt(0, size -1)\n",
|
| 1044 |
+
"\n",
|
| 1045 |
+
" shift = randomFloat(0, shift)\n",
|
| 1046 |
+
" t = distr[fi]\n",
|
| 1047 |
+
" distr[fi] -= shift\n",
|
| 1048 |
+
" if (distr[fi] < 0):\n",
|
| 1049 |
+
" distr[fi] = 0.0\n",
|
| 1050 |
+
" shift = t\n",
|
| 1051 |
+
" distr[si] += shift\n",
|
| 1052 |
+
" return distr\n",
|
| 1053 |
+
"\n",
|
| 1054 |
+
"def generateBinDistribution(size, ntrue):\n",
|
| 1055 |
+
" \"\"\"\n",
|
| 1056 |
+
" generate binary array with some elements set to 1\n",
|
| 1057 |
+
"\n",
|
| 1058 |
+
" Parameters\n",
|
| 1059 |
+
" size : distr size\n",
|
| 1060 |
+
" ntrue : no of true values\n",
|
| 1061 |
+
" \"\"\"\n",
|
| 1062 |
+
" distr = [0] * size\n",
|
| 1063 |
+
" idxs = selectRandomSubListFromList(list(range(size)), ntrue)\n",
|
| 1064 |
+
" for i in idxs:\n",
|
| 1065 |
+
" distr[i] = 1\n",
|
| 1066 |
+
" return distr\n",
|
| 1067 |
+
"\n",
|
| 1068 |
+
"def mutBinaryDistr(distr, nmut):\n",
|
| 1069 |
+
" \"\"\"\n",
|
| 1070 |
+
" mutate binary distribution\n",
|
| 1071 |
+
"\n",
|
| 1072 |
+
" Parameters\n",
|
| 1073 |
+
" distr : distr\n",
|
| 1074 |
+
" nmut : no of mutations\n",
|
| 1075 |
+
" \"\"\"\n",
|
| 1076 |
+
" idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)\n",
|
| 1077 |
+
" for i in idxs:\n",
|
| 1078 |
+
" distr[i] = distr[i] ^ 1\n",
|
| 1079 |
+
"\n",
|
| 1080 |
+
"\n",
|
| 1081 |
+
"def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=\",\"):\n",
|
| 1082 |
+
" \"\"\"\n",
|
| 1083 |
+
" file record generator that superimposes given data in the specified segment of a column\n",
|
| 1084 |
+
" Parameters\n",
|
| 1085 |
+
" filePath ; file path\n",
|
| 1086 |
+
" column : column index \n",
|
| 1087 |
+
" offset : offset into column values\n",
|
| 1088 |
+
" seqLen : length of subseq\n",
|
| 1089 |
+
" modifier : data to be superimposed either list or a sampler object\n",
|
| 1090 |
+
" precision : floating point precision\n",
|
| 1091 |
+
" delim : delemeter\n",
|
| 1092 |
+
" \"\"\"\n",
|
| 1093 |
+
" beg = offset\n",
|
| 1094 |
+
" end = beg + seqLen\n",
|
| 1095 |
+
" isList = type(modifier) == list\n",
|
| 1096 |
+
" i = 0\n",
|
| 1097 |
+
" for rec in fileRecGen(filePath, delim):\n",
|
| 1098 |
+
" if i >= beg and i < end:\n",
|
| 1099 |
+
" va = float(rec[column])\n",
|
| 1100 |
+
" if isList:\n",
|
| 1101 |
+
" va += modifier[i - beg] \n",
|
| 1102 |
+
" else:\n",
|
| 1103 |
+
" va += modifier.sample()\n",
|
| 1104 |
+
" rec[column] = formatFloat(precision, va)\n",
|
| 1105 |
+
" yield delim.join(rec)\n",
|
| 1106 |
+
" i += 1\n",
|
| 1107 |
+
"\n",
|
| 1108 |
+
"class ShiftedDataGenerator:\n",
|
| 1109 |
+
" \"\"\"\n",
|
| 1110 |
+
" transforms data for distribution shift\n",
|
| 1111 |
+
" \"\"\"\n",
|
| 1112 |
+
" def __init__(self, types, tdata, addFact, multFact):\n",
|
| 1113 |
+
" \"\"\"\n",
|
| 1114 |
+
" initializer\n",
|
| 1115 |
+
"\n",
|
| 1116 |
+
" Parameters\n",
|
| 1117 |
+
" types data types\n",
|
| 1118 |
+
" tdata : 2D array\n",
|
| 1119 |
+
" addFact ; factor for data shift\n",
|
| 1120 |
+
" multFact ; factor for data scaling\n",
|
| 1121 |
+
" \"\"\"\n",
|
| 1122 |
+
" (self.dtypes, self.cvalues) = extractTypesFromString(types)\n",
|
| 1123 |
+
"\n",
|
| 1124 |
+
" self.limits = dict()\n",
|
| 1125 |
+
" for k,v in self.dtypes.items():\n",
|
| 1126 |
+
" if v == \"int\" or v == \"false\":\n",
|
| 1127 |
+
" (vmax, vmin) = getColMinMax(tdata, k)\n",
|
| 1128 |
+
" self.limits[k] = vmax - vmin\n",
|
| 1129 |
+
" self.addMin = - addFact / 2\n",
|
| 1130 |
+
" self.addMax = addFact / 2\n",
|
| 1131 |
+
" self.multMin = 1.0 - multFact / 2\n",
|
| 1132 |
+
" self.multMax = 1.0 + multFact / 2\n",
|
| 1133 |
+
"\n",
|
| 1134 |
+
"\n",
|
| 1135 |
+
"\n",
|
| 1136 |
+
"\n",
|
| 1137 |
+
" def transform(self, tdata):\n",
|
| 1138 |
+
" \"\"\"\n",
|
| 1139 |
+
" linear transforms data to create distribution shift with random shift and scale\n",
|
| 1140 |
+
" Parameters\n",
|
| 1141 |
+
" types : data types\n",
|
| 1142 |
+
" \"\"\"\n",
|
| 1143 |
+
" transforms = dict()\n",
|
| 1144 |
+
" for k,v in self.dtypes.items():\n",
|
| 1145 |
+
" if v == \"int\" or v == \"false\":\t\t\t\t\n",
|
| 1146 |
+
" shift = randomFloat(self.addMin, self.addMax) * self.limits[k] \n",
|
| 1147 |
+
" scale = randomFloat(self.multMin, self.multMax)\n",
|
| 1148 |
+
" trns = (shift, scale)\n",
|
| 1149 |
+
" transforms[k] = trns\n",
|
| 1150 |
+
" elif v == \"cat\":\n",
|
| 1151 |
+
" transforms[k] = isEventSampled(50)\n",
|
| 1152 |
+
"\n",
|
| 1153 |
+
" ttdata = list()\n",
|
| 1154 |
+
" for rec in tdata:\n",
|
| 1155 |
+
" nrec = rec.copy()\n",
|
| 1156 |
+
" for c in range(len(rec)):\n",
|
| 1157 |
+
" if c in self.dtypes:\n",
|
| 1158 |
+
" dtype = self.dtypes[c]\n",
|
| 1159 |
+
" if dtype == \"int\" or dtype == \"float\":\n",
|
| 1160 |
+
" (shift, scale) = transforms[c]\n",
|
| 1161 |
+
" nval = shift + rec[c] * scale\n",
|
| 1162 |
+
" if dtype == \"int\":\n",
|
| 1163 |
+
" nrec[c] = int(nval)\n",
|
| 1164 |
+
" else:\n",
|
| 1165 |
+
" nrec[c] = nval\n",
|
| 1166 |
+
" elif dtype == \"cat\":\n",
|
| 1167 |
+
" cv = self.cvalues[c]\n",
|
| 1168 |
+
" if transforms[c]:\n",
|
| 1169 |
+
" nval = selectOtherRandomFromList(cv, rec[c])\n",
|
| 1170 |
+
" nrec[c] = nval\n",
|
| 1171 |
+
"\n",
|
| 1172 |
+
" ttdata.append(nrec)\n",
|
| 1173 |
+
"\n",
|
| 1174 |
+
" return ttdata\n",
|
| 1175 |
+
"\n",
|
| 1176 |
+
" def transformSpecified(self, tdata, sshift, scale):\n",
|
| 1177 |
+
" \"\"\"\n",
|
| 1178 |
+
" linear transforms data to create distribution shift shift specified shift and scale\n",
|
| 1179 |
+
" Parameters\n",
|
| 1180 |
+
" types : data types\n",
|
| 1181 |
+
" sshift : shift factor\n",
|
| 1182 |
+
" scale : scale factor\n",
|
| 1183 |
+
" \"\"\"\n",
|
| 1184 |
+
" transforms = dict()\n",
|
| 1185 |
+
" for k,v in self.dtypes.items():\n",
|
| 1186 |
+
" if v == \"int\" or v == \"false\":\t\t\t\t\n",
|
| 1187 |
+
" shift = sshift * self.limits[k] \n",
|
| 1188 |
+
" trns = (shift, scale)\n",
|
| 1189 |
+
" transforms[k] = trns\n",
|
| 1190 |
+
" elif v == \"cat\":\n",
|
| 1191 |
+
" transforms[k] = isEventSampled(50)\n",
|
| 1192 |
+
"\n",
|
| 1193 |
+
" ttdata = self.__scaleShift(tdata, transforms)\n",
|
| 1194 |
+
" return ttdata\n",
|
| 1195 |
+
"\n",
|
| 1196 |
+
" def __scaleShift(self, tdata, transforms):\n",
|
| 1197 |
+
" \"\"\"\n",
|
| 1198 |
+
" shifts and scales tabular data\n",
|
| 1199 |
+
"\n",
|
| 1200 |
+
" Parameters\n",
|
| 1201 |
+
" tdata : 2D array\n",
|
| 1202 |
+
" transforms : transforms to apply\n",
|
| 1203 |
+
" \"\"\"\n",
|
| 1204 |
+
" ttdata = list()\n",
|
| 1205 |
+
" for rec in tdata:\n",
|
| 1206 |
+
" nrec = rec.copy()\n",
|
| 1207 |
+
" for c in range(len(rec)):\n",
|
| 1208 |
+
" if c in self.dtypes:\n",
|
| 1209 |
+
" dtype = self.dtypes[c]\n",
|
| 1210 |
+
" if dtype == \"int\" or dtype == \"float\":\n",
|
| 1211 |
+
" (shift, scale) = transforms[c]\n",
|
| 1212 |
+
" nval = shift + rec[c] * scale\n",
|
| 1213 |
+
" if dtype == \"int\":\n",
|
| 1214 |
+
" nrec[c] = int(nval)\n",
|
| 1215 |
+
" else:\n",
|
| 1216 |
+
" nrec[c] = nval\n",
|
| 1217 |
+
" elif dtype == \"cat\":\n",
|
| 1218 |
+
" cv = self.cvalues[c]\n",
|
| 1219 |
+
" if transforms[c]:\n",
|
| 1220 |
+
" #nval = selectOtherRandomFromList(cv, rec[c])\n",
|
| 1221 |
+
" #nrec[c] = nval\n",
|
| 1222 |
+
" pass\n",
|
| 1223 |
+
"\n",
|
| 1224 |
+
" ttdata.append(nrec)\n",
|
| 1225 |
+
" return ttdata\n",
|
| 1226 |
+
"\n",
|
| 1227 |
+
"class RollingStat(object):\n",
|
| 1228 |
+
" \"\"\"\n",
|
| 1229 |
+
" stats for rolling windowt\n",
|
| 1230 |
+
" \"\"\"\n",
|
| 1231 |
+
" def __init__(self, wsize):\n",
|
| 1232 |
+
" \"\"\"\n",
|
| 1233 |
+
" initializer\n",
|
| 1234 |
+
"\n",
|
| 1235 |
+
" Parameters\n",
|
| 1236 |
+
" wsize : window size\n",
|
| 1237 |
+
" \"\"\"\n",
|
| 1238 |
+
" self.window = list()\n",
|
| 1239 |
+
" self.wsize = wsize\n",
|
| 1240 |
+
" self.mean = None\n",
|
| 1241 |
+
" self.sd = None\n",
|
| 1242 |
+
"\n",
|
| 1243 |
+
" def add(self, value):\n",
|
| 1244 |
+
" \"\"\"\n",
|
| 1245 |
+
" add a value\n",
|
| 1246 |
+
"\n",
|
| 1247 |
+
" Parameters\n",
|
| 1248 |
+
" value : value to add\n",
|
| 1249 |
+
" \"\"\"\n",
|
| 1250 |
+
" self.window.append(value)\n",
|
| 1251 |
+
" if len(self.window) > self.wsize:\n",
|
| 1252 |
+
" self.window = self.window[1:]\n",
|
| 1253 |
+
"\n",
|
| 1254 |
+
" def getStat(self):\n",
|
| 1255 |
+
" \"\"\"\n",
|
| 1256 |
+
" get rolling window mean and std deviation\n",
|
| 1257 |
+
" \"\"\"\n",
|
| 1258 |
+
" assertGreater(len(self.window), 0, \"window is empty\")\n",
|
| 1259 |
+
" if len(self.window) == 1:\n",
|
| 1260 |
+
" self.mean = self.window[0]\n",
|
| 1261 |
+
" self.sd = 0\n",
|
| 1262 |
+
" else:\n",
|
| 1263 |
+
" self.mean = statistics.mean(self.window)\n",
|
| 1264 |
+
" self.sd = statistics.stdev(self.window, xbar=self.mean)\n",
|
| 1265 |
+
" re = (self.mean, self.sd)\n",
|
| 1266 |
+
" return re\n",
|
| 1267 |
+
"\n",
|
| 1268 |
+
" def getSize(self):\n",
|
| 1269 |
+
" \"\"\"\n",
|
| 1270 |
+
" return window size\n",
|
| 1271 |
+
" \"\"\"\n",
|
| 1272 |
+
" return len(self.window)\n"
|
| 1273 |
+
]
|
| 1274 |
+
}
|
| 1275 |
+
],
|
| 1276 |
+
"metadata": {
|
| 1277 |
+
"kernelspec": {
|
| 1278 |
+
"display_name": "Python 3 (ipykernel)",
|
| 1279 |
+
"language": "python",
|
| 1280 |
+
"name": "python3"
|
| 1281 |
+
},
|
| 1282 |
+
"language_info": {
|
| 1283 |
+
"codemirror_mode": {
|
| 1284 |
+
"name": "ipython",
|
| 1285 |
+
"version": 3
|
| 1286 |
+
},
|
| 1287 |
+
"file_extension": ".py",
|
| 1288 |
+
"mimetype": "text/x-python",
|
| 1289 |
+
"name": "python",
|
| 1290 |
+
"nbconvert_exporter": "python",
|
| 1291 |
+
"pygments_lexer": "ipython3",
|
| 1292 |
+
"version": "3.9.12"
|
| 1293 |
+
}
|
| 1294 |
+
},
|
| 1295 |
+
"nbformat": 4,
|
| 1296 |
+
"nbformat_minor": 5
|
| 1297 |
+
}
|
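For reference, a minimal usage sketch of the RollingStat class defined in the mlutil notebook above; this sketch is not part of the uploaded files and assumes the notebook is importable as a module named mlutil (the tnn notebook below imports it the same way with "from mlutil import *").

# minimal usage sketch (assumption: the mlutil notebook is exported as module mlutil)
from mlutil import RollingStat

rs = RollingStat(5)                        # rolling window of size 5
for v in [2.0, 3.5, 4.1, 3.9, 5.2, 6.0]:
    rs.add(v)                              # only the most recent 5 values are kept
mean, sd = rs.getStat()                    # mean and std deviation of the current window
print(mean, sd, rs.getSize())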
lib/.ipynb_checkpoints/sampler-checkpoint.ipynb
ADDED
|
@@ -0,0 +1,6 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [],
|
| 3 |
+
"metadata": {},
|
| 4 |
+
"nbformat": 4,
|
| 5 |
+
"nbformat_minor": 5
|
| 6 |
+
}
|
lib/.ipynb_checkpoints/stats-checkpoint.ipynb
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "f4cbab42",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import sys\n",
|
| 11 |
+
"import random \n",
|
| 12 |
+
"import time\n",
|
| 13 |
+
"import math\n",
|
| 14 |
+
"import numpy as np\n",
|
| 15 |
+
"import statistics \n",
|
| 16 |
+
"from util import *\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"\"\"\"\n",
|
| 19 |
+
"histogram class\n",
|
| 20 |
+
"\"\"\"\n",
|
| 21 |
+
"class Histogram:\n",
|
| 22 |
+
" def __init__(self, min, binWidth):\n",
|
| 23 |
+
" \"\"\"\n",
|
| 24 |
+
" initializer\n",
|
| 25 |
+
"\n",
|
| 26 |
+
" Parameters\n",
|
| 27 |
+
" min : min x\n",
|
| 28 |
+
" binWidth : bin width\n",
|
| 29 |
+
" \"\"\"\n",
|
| 30 |
+
" self.xmin = min\n",
|
| 31 |
+
" self.binWidth = binWidth\n",
|
| 32 |
+
" self.normalized = False\n",
|
| 33 |
+
"\n",
|
| 34 |
+
" @classmethod\n",
|
| 35 |
+
" def createInitialized(cls, xmin, binWidth, values):\n",
|
| 36 |
+
" \"\"\"\n",
|
| 37 |
+
" create histogram instance with min domain, bin width and values\n",
|
| 38 |
+
"\n",
|
| 39 |
+
" Parameters\n",
|
| 40 |
+
" min : min x\n",
|
| 41 |
+
" binWidth : bin width\n",
|
| 42 |
+
" values : y values\n",
|
| 43 |
+
" \"\"\"\n",
|
| 44 |
+
" instance = cls(xmin, binWidth)\n",
|
| 45 |
+
" instance.xmax = xmin + binWidth * (len(values) - 1)\n",
|
| 46 |
+
" instance.ymin = 0\n",
|
| 47 |
+
" instance.bins = np.array(values)\n",
|
| 48 |
+
" instance.fmax = 0\n",
|
| 49 |
+
" for v in values:\n",
|
| 50 |
+
" if (v > instance.fmax):\n",
|
| 51 |
+
" instance.fmax = v\n",
|
| 52 |
+
" instance.ymin = 0.0\n",
|
| 53 |
+
" instance.ymax = instance.fmax\n",
|
| 54 |
+
" return instance\n",
|
| 55 |
+
"\n",
|
| 56 |
+
" @classmethod\n",
|
| 57 |
+
" def createWithNumBins(cls, values, numBins=20):\n",
|
| 58 |
+
" \"\"\"\n",
|
| 59 |
+
" create histogram instance values and no of bins\n",
|
| 60 |
+
"\n",
|
| 61 |
+
" Parameters\n",
|
| 62 |
+
" values : y values\n",
|
| 63 |
+
" numBins : no of bins\n",
|
| 64 |
+
" \"\"\"\n",
|
| 65 |
+
" xmin = min(values)\n",
|
| 66 |
+
" xmax = max(values)\n",
|
| 67 |
+
" binWidth = (xmax + .01 - (xmin - .01)) / numBins\n",
|
| 68 |
+
" instance = cls(xmin, binWidth)\n",
|
| 69 |
+
" instance.xmax = xmax\n",
|
| 70 |
+
" instance.numBin = numBins\n",
|
| 71 |
+
" instance.bins = np.zeros(instance.numBin)\n",
|
| 72 |
+
" for v in values:\n",
|
| 73 |
+
" instance.add(v)\n",
|
| 74 |
+
" return instance\n",
|
| 75 |
+
"\n",
|
| 76 |
+
" @classmethod\n",
|
| 77 |
+
" def createUninitialized(cls, xmin, xmax, binWidth):\n",
|
| 78 |
+
" \"\"\"\n",
|
| 79 |
+
" create histogram instance with no y values using domain min , max and bin width\n",
|
| 80 |
+
"\n",
|
| 81 |
+
" Parameters\n",
|
| 82 |
+
" min : min x\n",
|
| 83 |
+
" max : max x\n",
|
| 84 |
+
" binWidth : bin width\n",
|
| 85 |
+
" \"\"\"\n",
|
| 86 |
+
" instance = cls(xmin, binWidth)\n",
|
| 87 |
+
" instance.xmax = xmax\n",
|
| 88 |
+
" instance.numBin = (xmax - xmin) / binWidth + 1\n",
|
| 89 |
+
" instance.bins = np.zeros(instance.numBin)\n",
|
| 90 |
+
" return instance\n",
|
| 91 |
+
"\n",
|
| 92 |
+
" def initialize(self):\n",
|
| 93 |
+
" \"\"\"\n",
|
| 94 |
+
" set y values to 0\n",
|
| 95 |
+
" \"\"\"\n",
|
| 96 |
+
" self.bins = np.zeros(self.numBin)\n",
|
| 97 |
+
"\n",
|
| 98 |
+
" def add(self, value):\n",
|
| 99 |
+
" \"\"\"\n",
|
| 100 |
+
" adds a value to a bin\n",
|
| 101 |
+
"\n",
|
| 102 |
+
" Parameters\n",
|
| 103 |
+
" value : value\n",
|
| 104 |
+
" \"\"\"\n",
|
| 105 |
+
" bin = int((value - self.xmin) / self.binWidth)\n",
|
| 106 |
+
" if (bin < 0 or bin > self.numBin - 1):\n",
|
| 107 |
+
" print (bin)\n",
|
| 108 |
+
" raise ValueError(\"outside histogram range\")\n",
|
| 109 |
+
" self.bins[bin] += 1.0\n",
|
| 110 |
+
"\n",
|
| 111 |
+
" def normalize(self):\n",
|
| 112 |
+
" \"\"\"\n",
|
| 113 |
+
" normalize bin counts\n",
|
| 114 |
+
" \"\"\"\n",
|
| 115 |
+
" if not self.normalized:\n",
|
| 116 |
+
" total = self.bins.sum()\n",
|
| 117 |
+
" self.bins = np.divide(self.bins, total)\n",
|
| 118 |
+
" self.normalized = True\n",
|
| 119 |
+
"\n",
|
| 120 |
+
" def cumDistr(self):\n",
|
| 121 |
+
" \"\"\"\n",
|
| 122 |
+
" cumulative dists\n",
|
| 123 |
+
" \"\"\"\n",
|
| 124 |
+
" self.normalize()\n",
|
| 125 |
+
" self.cbins = np.cumsum(self.bins)\n",
|
| 126 |
+
" return self.cbins\n",
|
| 127 |
+
"\n",
|
| 128 |
+
" def distr(self):\n",
|
| 129 |
+
" \"\"\"\n",
|
| 130 |
+
" distr\n",
|
| 131 |
+
" \"\"\"\n",
|
| 132 |
+
" self.normalize()\n",
|
| 133 |
+
" return self.bins\n",
|
| 134 |
+
"\n",
|
| 135 |
+
"\n",
|
| 136 |
+
" def percentile(self, percent):\n",
|
| 137 |
+
" \"\"\"\n",
|
| 138 |
+
" return value corresponding to a percentile\n",
|
| 139 |
+
"\n",
|
| 140 |
+
" Parameters\n",
|
| 141 |
+
" percent : percentile value\n",
|
| 142 |
+
" \"\"\"\n",
|
| 143 |
+
" if self.cbins is None:\n",
|
| 144 |
+
" raise ValueError(\"cumulative distribution is not available\")\n",
|
| 145 |
+
"\n",
|
| 146 |
+
" for i,cuml in enumerate(self.cbins):\n",
|
| 147 |
+
" if percent > cuml:\n",
|
| 148 |
+
" value = (i * self.binWidth) - (self.binWidth / 2) + \\\n",
|
| 149 |
+
" (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1]) \n",
|
| 150 |
+
" break\n",
|
| 151 |
+
" return value\n",
|
| 152 |
+
"\n",
|
| 153 |
+
" def max(self):\n",
|
| 154 |
+
" \"\"\"\n",
|
| 155 |
+
" return max bin value \n",
|
| 156 |
+
" \"\"\"\n",
|
| 157 |
+
" return self.bins.max()\n",
|
| 158 |
+
"\n",
|
| 159 |
+
" def value(self, x):\n",
|
| 160 |
+
" \"\"\"\n",
|
| 161 |
+
" return a bin value\t\n",
|
| 162 |
+
"\n",
|
| 163 |
+
" Parameters\n",
|
| 164 |
+
" x : x value\n",
|
| 165 |
+
" \"\"\"\n",
|
| 166 |
+
" bin = int((x - self.xmin) / self.binWidth)\n",
|
| 167 |
+
" f = self.bins[bin]\n",
|
| 168 |
+
" return f\n",
|
| 169 |
+
"\n",
|
| 170 |
+
" def bin(self, x):\n",
|
| 171 |
+
" \"\"\"\n",
|
| 172 |
+
" return a bin index\t\n",
|
| 173 |
+
"\n",
|
| 174 |
+
" Parameters\n",
|
| 175 |
+
" x : x value\n",
|
| 176 |
+
" \"\"\"\n",
|
| 177 |
+
" return int((x - self.xmin) / self.binWidth)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
" def cumValue(self, x):\n",
|
| 180 |
+
" \"\"\"\n",
|
| 181 |
+
" return a cumulative bin value\t\n",
|
| 182 |
+
"\n",
|
| 183 |
+
" Parameters\n",
|
| 184 |
+
" x : x value\n",
|
| 185 |
+
" \"\"\"\n",
|
| 186 |
+
" bin = int((x - self.xmin) / self.binWidth)\n",
|
| 187 |
+
" c = self.cbins[bin]\n",
|
| 188 |
+
" return c\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"\n",
|
| 191 |
+
" def getMinMax(self):\n",
|
| 192 |
+
" \"\"\"\n",
|
| 193 |
+
" returns x min and x max\n",
|
| 194 |
+
" \"\"\"\n",
|
| 195 |
+
" return (self.xmin, self.xmax)\n",
|
| 196 |
+
"\n",
|
| 197 |
+
" def boundedValue(self, x):\n",
|
| 198 |
+
" \"\"\"\n",
|
| 199 |
+
" return x bounde by min and max\t\n",
|
| 200 |
+
"\n",
|
| 201 |
+
" Parameters\n",
|
| 202 |
+
" x : x value\n",
|
| 203 |
+
" \"\"\"\n",
|
| 204 |
+
" if x < self.xmin:\n",
|
| 205 |
+
" x = self.xmin\n",
|
| 206 |
+
" elif x > self.xmax:\n",
|
| 207 |
+
" x = self.xmax\n",
|
| 208 |
+
" return x\n",
|
| 209 |
+
"\n",
|
| 210 |
+
"\"\"\"\n",
|
| 211 |
+
"categorical histogram class\n",
|
| 212 |
+
"\"\"\"\n",
|
| 213 |
+
"class CatHistogram:\n",
|
| 214 |
+
" def __init__(self):\n",
|
| 215 |
+
" \"\"\"\n",
|
| 216 |
+
" initializer\n",
|
| 217 |
+
" \"\"\"\n",
|
| 218 |
+
" self.binCounts = dict()\n",
|
| 219 |
+
" self.counts = 0\n",
|
| 220 |
+
" self.normalized = False\n",
|
| 221 |
+
"\n",
|
| 222 |
+
" def add(self, value):\n",
|
| 223 |
+
" \"\"\"\n",
|
| 224 |
+
" adds a value to a bin\n",
|
| 225 |
+
"\n",
|
| 226 |
+
" Parameters\n",
|
| 227 |
+
" x : x value\n",
|
| 228 |
+
" \"\"\"\n",
|
| 229 |
+
" addToKeyedCounter(self.binCounts, value)\n",
|
| 230 |
+
" self.counts += 1\t\n",
|
| 231 |
+
"\n",
|
| 232 |
+
" def normalize(self):\n",
|
| 233 |
+
" \"\"\"\n",
|
| 234 |
+
" normalize\n",
|
| 235 |
+
" \"\"\"\n",
|
| 236 |
+
" if not self.normalized:\n",
|
| 237 |
+
" self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))\n",
|
| 238 |
+
" self.normalized = True\n",
|
| 239 |
+
"\n",
|
| 240 |
+
" def getMode(self):\n",
|
| 241 |
+
" \"\"\"\n",
|
| 242 |
+
" get mode\n",
|
| 243 |
+
" \"\"\"\n",
|
| 244 |
+
" maxk = None\n",
|
| 245 |
+
" maxv = 0\n",
|
| 246 |
+
" #print(self.binCounts)\n",
|
| 247 |
+
" for k,v in self.binCounts.items():\n",
|
| 248 |
+
" if v > maxv:\n",
|
| 249 |
+
" maxk = k\n",
|
| 250 |
+
" maxv = v\n",
|
| 251 |
+
" return (maxk, maxv)\t\n",
|
| 252 |
+
"\n",
|
| 253 |
+
" def getEntropy(self):\n",
|
| 254 |
+
" \"\"\"\n",
|
| 255 |
+
" get entropy\n",
|
| 256 |
+
" \"\"\"\n",
|
| 257 |
+
" self.normalize()\n",
|
| 258 |
+
" entr = 0 \n",
|
| 259 |
+
" #print(self.binCounts)\n",
|
| 260 |
+
" for k,v in self.binCounts.items():\n",
|
| 261 |
+
" entr -= v * math.log(v)\n",
|
| 262 |
+
" return entr\n",
|
| 263 |
+
"\n",
|
| 264 |
+
" def getUniqueValues(self):\n",
|
| 265 |
+
" \"\"\"\n",
|
| 266 |
+
" get unique values\n",
|
| 267 |
+
" \"\"\"\t\t\n",
|
| 268 |
+
" return list(self.binCounts.keys())\n",
|
| 269 |
+
"\n",
|
| 270 |
+
" def getDistr(self):\n",
|
| 271 |
+
" \"\"\"\n",
|
| 272 |
+
" get distribution\n",
|
| 273 |
+
" \"\"\"\t\n",
|
| 274 |
+
" self.normalize()\t\n",
|
| 275 |
+
" return self.binCounts.copy()\n",
|
| 276 |
+
"\n",
|
| 277 |
+
"class RunningStat:\n",
|
| 278 |
+
" \"\"\"\n",
|
| 279 |
+
" running stat class\n",
|
| 280 |
+
" \"\"\"\n",
|
| 281 |
+
" def __init__(self):\n",
|
| 282 |
+
" \"\"\"\n",
|
| 283 |
+
" initializer\t\n",
|
| 284 |
+
" \"\"\"\n",
|
| 285 |
+
" self.sum = 0.0\n",
|
| 286 |
+
" self.sumSq = 0.0\n",
|
| 287 |
+
" self.count = 0\n",
|
| 288 |
+
"\n",
|
| 289 |
+
" @staticmethod\n",
|
| 290 |
+
" def create(count, sum, sumSq):\n",
|
| 291 |
+
" \"\"\"\n",
|
| 292 |
+
" creates iinstance\t\n",
|
| 293 |
+
"\n",
|
| 294 |
+
" Parameters\n",
|
| 295 |
+
" sum : sum of values\n",
|
| 296 |
+
" sumSq : sum of valure squared\n",
|
| 297 |
+
" \"\"\"\n",
|
| 298 |
+
" rs = RunningStat()\n",
|
| 299 |
+
" rs.sum = sum\n",
|
| 300 |
+
" rs.sumSq = sumSq\n",
|
| 301 |
+
" rs.count = count\n",
|
| 302 |
+
" return rs\n",
|
| 303 |
+
"\n",
|
| 304 |
+
" def add(self, value):\n",
|
| 305 |
+
" \"\"\"\n",
|
| 306 |
+
" adds new value\n",
|
| 307 |
+
" Parameters\n",
|
| 308 |
+
" value : value to add\n",
|
| 309 |
+
" \"\"\"\n",
|
| 310 |
+
" self.sum += value\n",
|
| 311 |
+
" self.sumSq += (value * value)\n",
|
| 312 |
+
" self.count += 1\n",
|
| 313 |
+
"\n",
|
| 314 |
+
" def getStat(self):\n",
|
| 315 |
+
" \"\"\"\n",
|
| 316 |
+
" return mean and std deviation \n",
|
| 317 |
+
" \"\"\"\n",
|
| 318 |
+
" mean = self.sum /self. count\n",
|
| 319 |
+
" t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
|
| 320 |
+
" sd = math.sqrt(t)\n",
|
| 321 |
+
" re = (mean, sd)\n",
|
| 322 |
+
" return re\n",
|
| 323 |
+
"\n",
|
| 324 |
+
" def addGetStat(self,value):\n",
|
| 325 |
+
" \"\"\"\n",
|
| 326 |
+
" calculate mean and std deviation with new value added\n",
|
| 327 |
+
" Parameters\n",
|
| 328 |
+
" value : value to add\n",
|
| 329 |
+
" \"\"\"\n",
|
| 330 |
+
" self.add(value)\n",
|
| 331 |
+
" re = self.getStat()\n",
|
| 332 |
+
" return re\n",
|
| 333 |
+
"\n",
|
| 334 |
+
" def getCount(self):\n",
|
| 335 |
+
" \"\"\"\n",
|
| 336 |
+
" return count\n",
|
| 337 |
+
" \"\"\"\n",
|
| 338 |
+
" return self.count\n",
|
| 339 |
+
"\n",
|
| 340 |
+
" def getState(self):\n",
|
| 341 |
+
" \"\"\"\n",
|
| 342 |
+
" return state\n",
|
| 343 |
+
" \"\"\"\n",
|
| 344 |
+
" s = (self.count, self.sum, self.sumSq)\n",
|
| 345 |
+
" return s\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"class SlidingWindowStat:\n",
|
| 348 |
+
" \"\"\"\n",
|
| 349 |
+
" sliding window stats\n",
|
| 350 |
+
" \"\"\"\n",
|
| 351 |
+
" def __init__(self):\n",
|
| 352 |
+
" \"\"\"\n",
|
| 353 |
+
" initializer\n",
|
| 354 |
+
" \"\"\"\n",
|
| 355 |
+
" self.sum = 0.0\n",
|
| 356 |
+
" self.sumSq = 0.0\n",
|
| 357 |
+
" self.count = 0\n",
|
| 358 |
+
" self.values = None\n",
|
| 359 |
+
"\n",
|
| 360 |
+
" @staticmethod\n",
|
| 361 |
+
" def create(values, sum, sumSq):\n",
|
| 362 |
+
" \"\"\"\n",
|
| 363 |
+
" creates iinstance\t\n",
|
| 364 |
+
"\n",
|
| 365 |
+
" Parameters\n",
|
| 366 |
+
" sum : sum of values\n",
|
| 367 |
+
" sumSq : sum of valure squared\n",
|
| 368 |
+
" \"\"\"\n",
|
| 369 |
+
" sws = SlidingWindowStat()\n",
|
| 370 |
+
" sws.sum = sum\n",
|
| 371 |
+
" sws.sumSq = sumSq\n",
|
| 372 |
+
" self.values = values.copy()\n",
|
| 373 |
+
" sws.count = len(self.values)\n",
|
| 374 |
+
" return sws\n",
|
| 375 |
+
"\n",
|
| 376 |
+
" @staticmethod\n",
|
| 377 |
+
" def initialize(values):\n",
|
| 378 |
+
" \"\"\"\n",
|
| 379 |
+
" creates iinstance\t\n",
|
| 380 |
+
"\n",
|
| 381 |
+
" Parameters\n",
|
| 382 |
+
" values : list of values\n",
|
| 383 |
+
" \"\"\"\n",
|
| 384 |
+
" sws = SlidingWindowStat()\n",
|
| 385 |
+
" sws.values = values.copy()\n",
|
| 386 |
+
" for v in sws.values:\n",
|
| 387 |
+
" sws.sum += v\n",
|
| 388 |
+
" sws.sumSq += v * v\t\t\n",
|
| 389 |
+
" sws.count = len(sws.values)\n",
|
| 390 |
+
" return sws\n",
|
| 391 |
+
"\n",
|
| 392 |
+
" @staticmethod\n",
|
| 393 |
+
" def createEmpty(count):\n",
|
| 394 |
+
" \"\"\"\n",
|
| 395 |
+
" creates iinstance\t\n",
|
| 396 |
+
"\n",
|
| 397 |
+
" Parameters\n",
|
| 398 |
+
" count : count of values\n",
|
| 399 |
+
" \"\"\"\n",
|
| 400 |
+
" sws = SlidingWindowStat()\n",
|
| 401 |
+
" sws.count = count\n",
|
| 402 |
+
" sws.values = list()\n",
|
| 403 |
+
" return sws\n",
|
| 404 |
+
"\n",
|
| 405 |
+
" def add(self, value):\n",
|
| 406 |
+
" \"\"\"\n",
|
| 407 |
+
" adds new value\n",
|
| 408 |
+
"\n",
|
| 409 |
+
" Parameters\n",
|
| 410 |
+
" value : value to add\n",
|
| 411 |
+
" \"\"\"\n",
|
| 412 |
+
" self.values.append(value)\t\t\n",
|
| 413 |
+
" if len(self.values) > self.count:\n",
|
| 414 |
+
" self.sum += value - self.values[0]\n",
|
| 415 |
+
" self.sumSq += (value * value) - (self.values[0] * self.values[0])\n",
|
| 416 |
+
" self.values.pop(0)\n",
|
| 417 |
+
" else:\n",
|
| 418 |
+
" self.sum += value\n",
|
| 419 |
+
" self.sumSq += (value * value)\n",
|
| 420 |
+
"\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" def getStat(self):\n",
|
| 423 |
+
" \"\"\"\n",
|
| 424 |
+
" calculate mean and std deviation \n",
|
| 425 |
+
" \"\"\"\n",
|
| 426 |
+
" mean = self.sum /self. count\n",
|
| 427 |
+
" t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
|
| 428 |
+
" sd = math.sqrt(t)\n",
|
| 429 |
+
" re = (mean, sd)\n",
|
| 430 |
+
" return re\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" def addGetStat(self,value):\n",
|
| 433 |
+
" \"\"\"\n",
|
| 434 |
+
" calculate mean and std deviation with new value added\n",
|
| 435 |
+
" \"\"\"\n",
|
| 436 |
+
" self.add(value)\n",
|
| 437 |
+
" re = self.getStat()\n",
|
| 438 |
+
" return re\n",
|
| 439 |
+
"\n",
|
| 440 |
+
" def getCount(self):\n",
|
| 441 |
+
" \"\"\"\n",
|
| 442 |
+
" return count\n",
|
| 443 |
+
" \"\"\"\n",
|
| 444 |
+
" return self.count\n",
|
| 445 |
+
"\n",
|
| 446 |
+
" def getCurSize(self):\n",
|
| 447 |
+
" \"\"\"\n",
|
| 448 |
+
" return count\n",
|
| 449 |
+
" \"\"\"\n",
|
| 450 |
+
" return len(self.values)\n",
|
| 451 |
+
"\n",
|
| 452 |
+
" def getState(self):\n",
|
| 453 |
+
" \"\"\"\n",
|
| 454 |
+
" return state\n",
|
| 455 |
+
" \"\"\"\n",
|
| 456 |
+
" s = (self.count, self.sum, self.sumSq)\n",
|
| 457 |
+
" return s\n",
|
| 458 |
+
"\n",
|
| 459 |
+
"\n",
|
| 460 |
+
"def basicStat(ldata):\n",
|
| 461 |
+
" \"\"\"\n",
|
| 462 |
+
" mean and std dev\n",
|
| 463 |
+
" Parameters\n",
|
| 464 |
+
" ldata : list of values\n",
|
| 465 |
+
" \"\"\"\n",
|
| 466 |
+
" m = statistics.mean(ldata)\n",
|
| 467 |
+
" s = statistics.stdev(ldata, xbar=m)\n",
|
| 468 |
+
" r = (m, s)\n",
|
| 469 |
+
" return r\n",
|
| 470 |
+
"\n",
|
| 471 |
+
"def getFileColumnStat(filePath, col, delem=\",\"):\n",
|
| 472 |
+
" \"\"\"\n",
|
| 473 |
+
" gets stats for a file column\n",
|
| 474 |
+
"\n",
|
| 475 |
+
" Parameters\n",
|
| 476 |
+
" filePath : file path\n",
|
| 477 |
+
" col : col index\n",
|
| 478 |
+
" delem : field delemter\n",
|
| 479 |
+
" \"\"\"\n",
|
| 480 |
+
" rs = RunningStat()\n",
|
| 481 |
+
" for rec in fileRecGen(filePath, delem):\n",
|
| 482 |
+
" va = float(rec[col])\n",
|
| 483 |
+
" rs.add(va)\n",
|
| 484 |
+
"\n",
|
| 485 |
+
" return rs.getStat()\n"
|
| 486 |
+
]
|
| 487 |
+
}
|
| 488 |
+
],
|
| 489 |
+
"metadata": {
|
| 490 |
+
"kernelspec": {
|
| 491 |
+
"display_name": "Python 3 (ipykernel)",
|
| 492 |
+
"language": "python",
|
| 493 |
+
"name": "python3"
|
| 494 |
+
},
|
| 495 |
+
"language_info": {
|
| 496 |
+
"codemirror_mode": {
|
| 497 |
+
"name": "ipython",
|
| 498 |
+
"version": 3
|
| 499 |
+
},
|
| 500 |
+
"file_extension": ".py",
|
| 501 |
+
"mimetype": "text/x-python",
|
| 502 |
+
"name": "python",
|
| 503 |
+
"nbconvert_exporter": "python",
|
| 504 |
+
"pygments_lexer": "ipython3",
|
| 505 |
+
"version": "3.9.12"
|
| 506 |
+
}
|
| 507 |
+
},
|
| 508 |
+
"nbformat": 4,
|
| 509 |
+
"nbformat_minor": 5
|
| 510 |
+
}
|
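For reference, a minimal usage sketch of the RunningStat class defined in the stats notebook above; this sketch is not part of the uploaded files and assumes the notebook is importable as a module named stats.

# minimal usage sketch (assumption: the stats notebook is exported as module stats)
from stats import RunningStat

rs = RunningStat()
for v in [1.0, 2.0, 4.0, 8.0]:
    rs.add(v)                              # accumulates count, sum and sum of squares
mean, sd = rs.getStat()                    # mean and unbiased std deviation so far
count, total, totalSq = rs.getState()      # raw state tuple (count, sum, sumSq)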
lib/.ipynb_checkpoints/tnn-checkpoint.ipynb
ADDED
|
@@ -0,0 +1,800 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "3853095d",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"import matplotlib.pyplot as plt\n",
|
| 13 |
+
"import numpy as np\n",
|
| 14 |
+
"import torch\n",
|
| 15 |
+
"from torch.autograd import Variable\n",
|
| 16 |
+
"from torch.utils.data import Dataset, TensorDataset\n",
|
| 17 |
+
"from torch.utils.data import DataLoader\n",
|
| 18 |
+
"import sklearn as sk\n",
|
| 19 |
+
"from sklearn.neighbors import KDTree\n",
|
| 20 |
+
"import matplotlib\n",
|
| 21 |
+
"import random\n",
|
| 22 |
+
"import jprops\n",
|
| 23 |
+
"from random import randint\n",
|
| 24 |
+
"import statistics\n",
|
| 25 |
+
"sys.path.append(os.path.abspath(\"../lib\"))\n",
|
| 26 |
+
"from util import *\n",
|
| 27 |
+
"from mlutil import *\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"\"\"\"\n",
|
| 30 |
+
"forward hook function\n",
|
| 31 |
+
"\"\"\"\n",
|
| 32 |
+
"intermedOut = {}\n",
|
| 33 |
+
"lvalues = list()\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"def hookFn(m, i, o):\n",
|
| 36 |
+
" \"\"\"\n",
|
| 37 |
+
" call back for latent values\n",
|
| 38 |
+
" \"\"\"\n",
|
| 39 |
+
" #intermedOut[m] = o\n",
|
| 40 |
+
" lv = o.data.cpu().numpy()\n",
|
| 41 |
+
" lv = lv[0].tolist()\n",
|
| 42 |
+
" lvalues.append(lv)\n",
|
| 43 |
+
" #print(lv)\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"def getLatValues():\n",
|
| 46 |
+
" \"\"\"\n",
|
| 47 |
+
" \"\"\"\n",
|
| 48 |
+
" return lvalues\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"class FeedForwardNetwork(torch.nn.Module):\n",
|
| 51 |
+
" def __init__(self, configFile, addDefValues=None):\n",
|
| 52 |
+
" \"\"\"\n",
|
| 53 |
+
" In the constructor we instantiate two nn.Linear modules and assign them as\n",
|
| 54 |
+
" member variables.\n",
|
| 55 |
+
"\n",
|
| 56 |
+
" Parameters\n",
|
| 57 |
+
" configFile : config file path\n",
|
| 58 |
+
" addDefValues : dictionary of additional default values\t\n",
|
| 59 |
+
" \"\"\"\n",
|
| 60 |
+
" defValues = dict() if addDefValues is None else addDefValues.copy()\n",
|
| 61 |
+
" defValues[\"common.mode\"] = (\"training\", None)\n",
|
| 62 |
+
" defValues[\"common.model.directory\"] = (\"model\", None)\n",
|
| 63 |
+
" defValues[\"common.model.file\"] = (None, None)\n",
|
| 64 |
+
" defValues[\"common.preprocessing\"] = (None, None)\n",
|
| 65 |
+
" defValues[\"common.scaling.method\"] = (\"zscale\", None)\n",
|
| 66 |
+
" defValues[\"common.scaling.minrows\"] = (50, None)\n",
|
| 67 |
+
" defValues[\"common.scaling.param.file\"] = (None, None)\n",
|
| 68 |
+
" defValues[\"common.verbose\"] = (False, None)\n",
|
| 69 |
+
" defValues[\"common.device\"] = (\"cpu\", None)\n",
|
| 70 |
+
" defValues[\"train.data.file\"] = (None, \"missing training data file\")\n",
|
| 71 |
+
" defValues[\"train.data.fields\"] = (None, \"missing training data field ordinals\")\n",
|
| 72 |
+
" defValues[\"train.data.feature.fields\"] = (None, \"missing training data feature field ordinals\")\n",
|
| 73 |
+
" defValues[\"train.data.out.fields\"] = (None, \"missing training data feature field ordinals\")\n",
|
| 74 |
+
" defValues[\"train.layer.data\"] = (None, \"missing layer data\")\n",
|
| 75 |
+
" defValues[\"train.input.size\"] = (None, None)\n",
|
| 76 |
+
" defValues[\"train.output.size\"] = (None, \"missing output size\")\n",
|
| 77 |
+
" defValues[\"train.batch.size\"] = (10, None)\n",
|
| 78 |
+
" defValues[\"train.loss.reduction\"] = (\"mean\", None)\n",
|
| 79 |
+
" defValues[\"train.num.iterations\"] = (500, None)\n",
|
| 80 |
+
" defValues[\"train.lossFn\"] = (\"mse\", None) \n",
|
| 81 |
+
" defValues[\"train.optimizer\"] = (\"sgd\", None) \n",
|
| 82 |
+
" defValues[\"train.opt.learning.rate\"] = (.0001, None)\n",
|
| 83 |
+
" defValues[\"train.opt.weight.decay\"] = (0, None) \n",
|
| 84 |
+
" defValues[\"train.opt.momentum\"] = (0, None) \n",
|
| 85 |
+
" defValues[\"train.opt.eps\"] = (1e-08, None) \n",
|
| 86 |
+
" defValues[\"train.opt.dampening\"] = (0, None) \n",
|
| 87 |
+
" defValues[\"train.opt.momentum.nesterov\"] = (False, None) \n",
|
| 88 |
+
" defValues[\"train.opt.betas\"] = ([0.9, 0.999], None) \n",
|
| 89 |
+
" defValues[\"train.opt.alpha\"] = (0.99, None) \n",
|
| 90 |
+
" defValues[\"train.save.model\"] = (False, None) \n",
|
| 91 |
+
" defValues[\"train.track.error\"] = (False, None) \n",
|
| 92 |
+
" defValues[\"train.epoch.intv\"] = (5, None) \n",
|
| 93 |
+
" defValues[\"train.batch.intv\"] = (5, None) \n",
|
| 94 |
+
" defValues[\"train.print.weights\"] = (False, None) \n",
|
| 95 |
+
" defValues[\"valid.data.file\"] = (None, None)\n",
|
| 96 |
+
" defValues[\"valid.accuracy.metric\"] = (None, None)\n",
|
| 97 |
+
" defValues[\"predict.data.file\"] = (None, None)\n",
|
| 98 |
+
" defValues[\"predict.use.saved.model\"] = (True, None)\n",
|
| 99 |
+
" defValues[\"predict.output\"] = (\"binary\", None)\n",
|
| 100 |
+
" defValues[\"predict.feat.pad.size\"] = (60, None)\n",
|
| 101 |
+
" defValues[\"predict.print.output\"] = (True, None)\n",
|
| 102 |
+
" defValues[\"calibrate.num.bins\"] = (10, None)\n",
|
| 103 |
+
" defValues[\"calibrate.pred.prob.thresh\"] = (0.5, None)\n",
|
| 104 |
+
" defValues[\"calibrate.num.nearest.neighbors\"] = (10, None)\n",
|
| 105 |
+
" self.config = Configuration(configFile, defValues)\n",
|
| 106 |
+
"\n",
|
| 107 |
+
" super(FeedForwardNetwork, self).__init__()\n",
|
| 108 |
+
"\n",
|
| 109 |
+
" def setConfigParam(self, name, value):\n",
|
| 110 |
+
" \"\"\"\n",
|
| 111 |
+
" set config param\n",
|
| 112 |
+
"\n",
|
| 113 |
+
" Parameters\n",
|
| 114 |
+
" name : config name\n",
|
| 115 |
+
" value : config value\n",
|
| 116 |
+
" \"\"\"\n",
|
| 117 |
+
" self.config.setParam(name, value)\n",
|
| 118 |
+
"\n",
|
| 119 |
+
" def getConfig(self):\n",
|
| 120 |
+
" \"\"\"\n",
|
| 121 |
+
" get config object\n",
|
| 122 |
+
" \"\"\"\n",
|
| 123 |
+
" return self.config\n",
|
| 124 |
+
"\n",
|
| 125 |
+
" def setVerbose(self, verbose):\n",
|
| 126 |
+
" self.verbose = verbose\n",
|
| 127 |
+
"\n",
|
| 128 |
+
" def buildModel(self):\n",
|
| 129 |
+
" \"\"\"\n",
|
| 130 |
+
" Loads configuration and builds the various piecess necessary for the model\n",
|
| 131 |
+
" \"\"\"\n",
|
| 132 |
+
" torch.manual_seed(9999)\n",
|
| 133 |
+
"\n",
|
| 134 |
+
" self.verbose = self.config.getBooleanConfig(\"common.verbose\")[0]\n",
|
| 135 |
+
" numinp = self.config.getIntConfig(\"train.input.size\")[0]\n",
|
| 136 |
+
" if numinp is None:\n",
|
| 137 |
+
" numinp = len(self.config.getIntListConfig(\"train.data.feature.fields\")[0])\n",
|
| 138 |
+
" #numOut = len(self.config.getStringConfig(\"train.data.out.fields\")[0].split(\",\"))\n",
|
| 139 |
+
" self.outputSize = self.config.getIntConfig(\"train.output.size\")[0]\n",
|
| 140 |
+
" self.batchSize = self.config.getIntConfig(\"train.batch.size\")[0]\n",
|
| 141 |
+
" #lossRed = self.config.getStringConfig(\"train.loss.reduction\")[0]\n",
|
| 142 |
+
" #learnRate = self.config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
|
| 143 |
+
" self.numIter = self.config.getIntConfig(\"train.num.iterations\")[0]\n",
|
| 144 |
+
" optimizer = self.config.getStringConfig(\"train.optimizer\")[0]\n",
|
| 145 |
+
" self.lossFnStr = self.config.getStringConfig(\"train.lossFn\")[0]\n",
|
| 146 |
+
" self.accMetric = self.config.getStringConfig(\"valid.accuracy.metric\")[0]\n",
|
| 147 |
+
" self.trackErr = self.config.getBooleanConfig(\"train.track.error\")[0]\n",
|
| 148 |
+
" self.batchIntv = self.config.getIntConfig(\"train.batch.intv\")[0]\n",
|
| 149 |
+
" self.restored = False\n",
|
| 150 |
+
" self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None\n",
|
| 151 |
+
"\n",
|
| 152 |
+
" #build network\n",
|
| 153 |
+
" layers = list()\n",
|
| 154 |
+
" ninp = numinp\n",
|
| 155 |
+
" trData = self.config.getStringConfig(\"train.layer.data\")[0].split(\",\")\n",
|
| 156 |
+
" for ld in trData:\n",
|
| 157 |
+
" lde = ld.split(\":\")\n",
|
| 158 |
+
" assert len(lde) == 5, \"expecting 5 items for layer data\"\n",
|
| 159 |
+
"\n",
|
| 160 |
+
" #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction\n",
|
| 161 |
+
" nunit = int(lde[0])\n",
|
| 162 |
+
" actStr = lde[1]\n",
|
| 163 |
+
" act = FeedForwardNetwork.createActivation(actStr) if actStr != \"none\" else None\n",
|
| 164 |
+
" bnorm = lde[2] == \"true\"\n",
|
| 165 |
+
" afterAct = lde[3] == \"true\"\n",
|
| 166 |
+
" dpr = float(lde[4])\n",
|
| 167 |
+
"\n",
|
| 168 |
+
" layers.append(torch.nn.Linear(ninp, nunit))\t\t\t\n",
|
| 169 |
+
" if bnorm:\n",
|
| 170 |
+
" #with batch norm\n",
|
| 171 |
+
" if afterAct:\n",
|
| 172 |
+
" safeAppend(layers, act)\n",
|
| 173 |
+
" layers.append(torch.nn.BatchNorm1d(nunit))\n",
|
| 174 |
+
" else:\n",
|
| 175 |
+
" layers.append(torch.nn.BatchNorm1d(nunit))\n",
|
| 176 |
+
" safeAppend(layers, act)\n",
|
| 177 |
+
" else:\n",
|
| 178 |
+
" #without batch norm\n",
|
| 179 |
+
" safeAppend(layers, act)\n",
|
| 180 |
+
"\n",
|
| 181 |
+
" if dpr > 0:\n",
|
| 182 |
+
" layers.append(torch.nn.Dropout(dpr))\n",
|
| 183 |
+
" ninp = nunit\n",
|
| 184 |
+
"\n",
|
| 185 |
+
" self.layers = torch.nn.Sequential(*layers)\t\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" self.device = FeedForwardNetwork.getDevice(self)\n",
|
| 188 |
+
"\n",
|
| 189 |
+
" #training data\n",
|
| 190 |
+
" dataFile = self.config.getStringConfig(\"train.data.file\")[0]\n",
|
| 191 |
+
" (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)\n",
|
| 192 |
+
" self.featData = torch.from_numpy(featData)\n",
|
| 193 |
+
" self.outData = torch.from_numpy(outData)\n",
|
| 194 |
+
"\n",
|
| 195 |
+
" #validation data\n",
|
| 196 |
+
" dataFile = self.config.getStringConfig(\"valid.data.file\")[0]\n",
|
| 197 |
+
" (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)\n",
|
| 198 |
+
" self.validFeatData = torch.from_numpy(featDataV)\n",
|
| 199 |
+
" self.validOutData = torch.from_numpy(outDataV)\n",
|
| 200 |
+
"\n",
|
| 201 |
+
" # loss function and optimizer\n",
|
| 202 |
+
" self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)\n",
|
| 203 |
+
" self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)\n",
|
| 204 |
+
"\n",
|
| 205 |
+
" self.yPred = None\n",
|
| 206 |
+
" self.restored = False\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" #mode to device\n",
|
| 209 |
+
" self.device = FeedForwardNetwork.getDevice(self)\t\n",
|
| 210 |
+
" self.featData = self.featData.to(self.device)\n",
|
| 211 |
+
" self.outData = self.outData.to(self.device)\n",
|
| 212 |
+
" self.validFeatData = self.validFeatData.to(self.device)\n",
|
| 213 |
+
" self.to(self.device)\n",
|
| 214 |
+
"\n",
|
| 215 |
+
" @staticmethod\n",
|
| 216 |
+
" def getDevice(model):\n",
|
| 217 |
+
" \"\"\"\n",
|
| 218 |
+
" gets device\n",
|
| 219 |
+
"\n",
|
| 220 |
+
" Parameters\n",
|
| 221 |
+
" model : torch model\n",
|
| 222 |
+
" \"\"\"\n",
|
| 223 |
+
" devType = model.config.getStringConfig(\"common.device\")[0]\n",
|
| 224 |
+
" if devType == \"cuda\":\n",
|
| 225 |
+
" if torch.cuda.is_available():\n",
|
| 226 |
+
" device = torch.device(\"cuda\")\n",
|
| 227 |
+
" else:\n",
|
| 228 |
+
" exitWithMsg(\"cuda not available\")\n",
|
| 229 |
+
" else:\n",
|
| 230 |
+
" device = torch.device(\"cpu\")\n",
|
| 231 |
+
" return device\n",
|
| 232 |
+
"\n",
|
| 233 |
+
" def setValidationData(self, dataSource, prep=True):\n",
|
| 234 |
+
" \"\"\"\n",
|
| 235 |
+
" sets validation data\n",
|
| 236 |
+
"\n",
|
| 237 |
+
" Parameters\n",
|
| 238 |
+
" dataSource : data source str if file path or 2D array\n",
|
| 239 |
+
" prep : if True load and prepare \n",
|
| 240 |
+
" \"\"\"\n",
|
| 241 |
+
" if prep:\n",
|
| 242 |
+
" (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)\n",
|
| 243 |
+
" self.validFeatData = torch.from_numpy(featDataV)\n",
|
| 244 |
+
" self.validOutData = outDataV\n",
|
| 245 |
+
" else:\n",
|
| 246 |
+
" self.validFeatData = torch.from_numpy(dataSource[0])\n",
|
| 247 |
+
" self.validOutData = dataSource[1]\t\t\n",
|
| 248 |
+
"\n",
|
| 249 |
+
" self.validFeatData = self.validFeatData.to(self.device)\n",
|
| 250 |
+
"\n",
|
| 251 |
+
" @staticmethod\n",
|
| 252 |
+
" def createActivation(actName):\n",
|
| 253 |
+
" \"\"\"\n",
|
| 254 |
+
" create activation\n",
|
| 255 |
+
"\n",
|
| 256 |
+
" Parameters\n",
|
| 257 |
+
" actName : activation name\n",
|
| 258 |
+
" \"\"\"\n",
|
| 259 |
+
" if actName is None:\n",
|
| 260 |
+
" activation = None\n",
|
| 261 |
+
" elif actName == \"relu\":\n",
|
| 262 |
+
" activation = torch.nn.ReLU()\n",
|
| 263 |
+
" elif actName == \"tanh\":\n",
|
| 264 |
+
" activation = torch.nn.Tanh()\n",
|
| 265 |
+
" elif actName == \"sigmoid\":\n",
|
| 266 |
+
" activation = torch.nn.Sigmoid()\n",
|
| 267 |
+
" elif actName == \"softmax\":\n",
|
| 268 |
+
" activation = torch.nn.Softmax(dim=1)\n",
|
| 269 |
+
" else:\n",
|
| 270 |
+
" exitWithMsg(\"invalid activation function name \" + actName)\n",
|
| 271 |
+
" return activation\n",
|
| 272 |
+
"\n",
|
| 273 |
+
" @staticmethod\n",
|
| 274 |
+
" def createLossFunction(model, lossFnName):\n",
|
| 275 |
+
" \"\"\"\n",
|
| 276 |
+
" create loss function\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" Parameters\n",
|
| 279 |
+
" lossFnName : loss function name\n",
|
| 280 |
+
" \"\"\"\n",
|
| 281 |
+
" config = model.config\n",
|
| 282 |
+
" lossRed = config.getStringConfig(\"train.loss.reduction\")[0]\n",
|
| 283 |
+
" if lossFnName == \"ltwo\" or lossFnName == \"mse\":\n",
|
| 284 |
+
" lossFunc = torch.nn.MSELoss(reduction=lossRed)\n",
|
| 285 |
+
" elif lossFnName == \"ce\":\n",
|
| 286 |
+
" lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)\n",
|
| 287 |
+
" elif lossFnName == \"lone\" or lossFnName == \"mae\":\n",
|
| 288 |
+
" lossFunc = torch.nn.L1Loss(reduction=lossRed)\n",
|
| 289 |
+
" elif lossFnName == \"bce\":\n",
|
| 290 |
+
" lossFunc = torch.nn.BCELoss(reduction=lossRed)\n",
|
| 291 |
+
" elif lossFnName == \"bcel\":\n",
|
| 292 |
+
" lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)\n",
|
| 293 |
+
" elif lossFnName == \"sm\":\n",
|
| 294 |
+
" lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)\n",
|
| 295 |
+
" elif lossFnName == \"mlsm\":\n",
|
| 296 |
+
" lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)\n",
|
| 297 |
+
" else:\n",
|
| 298 |
+
" exitWithMsg(\"invalid loss function name \" + lossFnName)\n",
|
| 299 |
+
" return lossFunc\n",
|
| 300 |
+
"\n",
|
| 301 |
+
" @staticmethod\n",
|
| 302 |
+
" def createOptimizer(model, optName):\n",
|
| 303 |
+
" \"\"\"\n",
|
| 304 |
+
" create optimizer\n",
|
| 305 |
+
"\n",
|
| 306 |
+
" Parameters\n",
|
| 307 |
+
" optName : optimizer name\n",
|
| 308 |
+
" \"\"\"\n",
|
| 309 |
+
" config = model.config\n",
|
| 310 |
+
" learnRate = config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
|
| 311 |
+
" weightDecay = config.getFloatConfig(\"train.opt.weight.decay\")[0]\n",
|
| 312 |
+
" momentum = config.getFloatConfig(\"train.opt.momentum\")[0]\n",
|
| 313 |
+
" eps = config.getFloatConfig(\"train.opt.eps\")[0]\n",
|
| 314 |
+
" if optName == \"sgd\":\n",
|
| 315 |
+
" dampening = config.getFloatConfig(\"train.opt.dampening\")[0]\n",
|
| 316 |
+
" momentumNesterov = config.getBooleanConfig(\"train.opt.momentum.nesterov\")[0]\n",
|
| 317 |
+
" optimizer = torch.optim.SGD(model.parameters(),lr=learnRate, momentum=momentum, \n",
|
| 318 |
+
" dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)\n",
|
| 319 |
+
" elif optName == \"adam\":\n",
|
| 320 |
+
" betas = config.getFloatListConfig(\"train.opt.betas\")[0]\n",
|
| 321 |
+
" betas = (betas[0], betas[1]) \n",
|
| 322 |
+
" optimizer = torch.optim.Adam(model.parameters(), lr=learnRate,betas=betas, eps = eps,\n",
|
| 323 |
+
" weight_decay=weightDecay)\n",
|
| 324 |
+
" elif optName == \"rmsprop\":\n",
|
| 325 |
+
" alpha = config.getFloatConfig(\"train.opt.alpha\")[0]\n",
|
| 326 |
+
" optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,\n",
|
| 327 |
+
" eps=eps, weight_decay=weightDecay, momentum=momentum)\n",
|
| 328 |
+
" else:\n",
|
| 329 |
+
" exitWithMsg(\"invalid optimizer name \" + optName)\n",
|
| 330 |
+
" return optimizer\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"\n",
|
| 333 |
+
" def forward(self, x):\n",
|
| 334 |
+
" \"\"\"\n",
|
| 335 |
+
" In the forward function we accept a Tensor of input data and we must return\n",
|
| 336 |
+
" a Tensor of output data. We can use Modules defined in the constructor as\n",
|
| 337 |
+
" well as arbitrary (differentiable) operations on Tensors.\n",
|
| 338 |
+
"\n",
|
| 339 |
+
" Parameters\n",
|
| 340 |
+
" x : data batch\n",
|
| 341 |
+
" \"\"\"\n",
|
| 342 |
+
" y = self.layers(x)\t\n",
|
| 343 |
+
" return y\n",
|
| 344 |
+
"\n",
|
| 345 |
+
" @staticmethod\n",
|
| 346 |
+
" def addForwardHook(model, l, cl = 0):\n",
|
| 347 |
+
" \"\"\"\n",
|
| 348 |
+
" register forward hooks\n",
|
| 349 |
+
"\n",
|
| 350 |
+
" Parameters\n",
|
| 351 |
+
" l : \n",
|
| 352 |
+
" cl :\n",
|
| 353 |
+
" \"\"\"\n",
|
| 354 |
+
" for name, layer in model._modules.items():\n",
|
| 355 |
+
" #If it is a sequential, don't register a hook on it\n",
|
| 356 |
+
" # but recursively register hook on all it's module children\n",
|
| 357 |
+
" print(str(cl) + \" : \" + name)\n",
|
| 358 |
+
" if isinstance(layer, torch.nn.Sequential):\n",
|
| 359 |
+
" FeedForwardNetwork.addForwardHook(layer, l, cl)\n",
|
| 360 |
+
" else:\n",
|
| 361 |
+
" #\t it's a non sequential. Register a hook\n",
|
| 362 |
+
" if cl == l:\n",
|
| 363 |
+
" print(\"setting hook at layer \" + str(l))\n",
|
| 364 |
+
" layer.register_forward_hook(hookFn)\n",
|
| 365 |
+
" cl += 1\n",
|
| 366 |
+
"\n",
|
| 367 |
+
" @staticmethod\n",
|
| 368 |
+
" def prepData(model, dataSource, includeOutFld=True):\n",
|
| 369 |
+
" \"\"\"\n",
|
| 370 |
+
" loads and prepares data\n",
|
| 371 |
+
"\n",
|
| 372 |
+
" Parameters\n",
|
| 373 |
+
" dataSource : data source str if file path or 2D array\n",
|
| 374 |
+
" includeOutFld : True if target freld to be included\n",
|
| 375 |
+
" \"\"\"\n",
|
| 376 |
+
" # parameters\n",
|
| 377 |
+
" fieldIndices = model.config.getIntListConfig(\"train.data.fields\")[0]\n",
|
| 378 |
+
" featFieldIndices = model.config.getIntListConfig(\"train.data.feature.fields\")[0]\n",
|
| 379 |
+
"\n",
|
| 380 |
+
" #all data and feature data\n",
|
| 381 |
+
" isDataFile = isinstance(dataSource, str)\n",
|
| 382 |
+
" selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]\n",
|
| 383 |
+
" if isDataFile: \n",
|
| 384 |
+
" #source file path \n",
|
| 385 |
+
" (data, featData) = loadDataFile(dataSource, \",\", selFieldIndices, featFieldIndices)\n",
|
| 386 |
+
" else:\n",
|
| 387 |
+
" # tabular data\n",
|
| 388 |
+
" data = tableSelFieldsFilter(dataSource, selFieldIndices)\n",
|
| 389 |
+
" featData = tableSelFieldsFilter(data, featFieldIndices)\n",
|
| 390 |
+
" #print(featData)\n",
|
| 391 |
+
" featData = np.array(featData)\n",
|
| 392 |
+
"\n",
|
| 393 |
+
" if (model.config.getStringConfig(\"common.preprocessing\")[0] == \"scale\"):\n",
|
| 394 |
+
" scalingMethod = model.config.getStringConfig(\"common.scaling.method\")[0]\n",
|
| 395 |
+
"\n",
|
| 396 |
+
" #scale only if there are enough rows\n",
|
| 397 |
+
" nrow = featData.shape[0]\n",
|
| 398 |
+
" minrows = model.config.getIntConfig(\"common.scaling.minrows\")[0]\n",
|
| 399 |
+
" if nrow > minrows:\n",
|
| 400 |
+
" #in place scaling\n",
|
| 401 |
+
" featData = scaleData(featData, scalingMethod)\n",
|
| 402 |
+
" else:\n",
|
| 403 |
+
" #use pre computes scaling parameters\n",
|
| 404 |
+
" spFile = model.config.getStringConfig(\"common.scaling.param.file\")[0]\n",
|
| 405 |
+
" if spFile is None:\n",
|
| 406 |
+
" exitWithMsg(\"for small data sets pre computed scaling parameters need to provided\")\n",
|
| 407 |
+
" scParams = restoreObject(spFile)\n",
|
| 408 |
+
" featData = scaleDataWithParams(featData, scalingMethod, scParams)\n",
|
| 409 |
+
" featData = np.array(featData)\n",
|
| 410 |
+
"\n",
|
| 411 |
+
" # target data\n",
|
| 412 |
+
" if includeOutFld:\n",
|
| 413 |
+
" outFieldIndices = model.config.getStringConfig(\"train.data.out.fields\")[0]\n",
|
| 414 |
+
" outFieldIndices = strToIntArray(outFieldIndices, \",\")\n",
|
| 415 |
+
" if isDataFile:\n",
|
| 416 |
+
" outData = data[:,outFieldIndices]\n",
|
| 417 |
+
" else:\n",
|
| 418 |
+
" outData = tableSelFieldsFilter(data, outFieldIndices)\n",
|
| 419 |
+
" outData = np.array(outData)\n",
|
| 420 |
+
" foData = (featData.astype(np.float32), outData.astype(np.float32))\n",
|
| 421 |
+
" else:\n",
|
| 422 |
+
" foData = featData.astype(np.float32)\n",
|
| 423 |
+
" return foData\n",
|
| 424 |
+
"\n",
|
| 425 |
+
" @staticmethod\n",
|
| 426 |
+
" def saveCheckpt(model):\n",
|
| 427 |
+
" \"\"\"\n",
|
| 428 |
+
" checkpoints model\n",
|
| 429 |
+
"\n",
|
| 430 |
+
" Parameters\n",
|
| 431 |
+
" model : torch model\n",
|
| 432 |
+
" \"\"\"\n",
|
| 433 |
+
" print(\"..saving model checkpoint\")\n",
|
| 434 |
+
" modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
|
| 435 |
+
" assert os.path.exists(modelDirectory), \"model save directory does not exist\"\n",
|
| 436 |
+
" modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
|
| 437 |
+
" filepath = os.path.join(modelDirectory, modelFile)\n",
|
| 438 |
+
" state = {\"state_dict\": model.state_dict(), \"optim_dict\": model.optimizer.state_dict()}\n",
|
| 439 |
+
" torch.save(state, filepath)\n",
|
| 440 |
+
" if model.verbose:\n",
|
| 441 |
+
" print(\"model saved\")\n",
|
| 442 |
+
"\n",
|
| 443 |
+
" @staticmethod\n",
|
| 444 |
+
" def restoreCheckpt(model, loadOpt=False):\n",
|
| 445 |
+
" \"\"\"\n",
|
| 446 |
+
" restored checkpointed model\n",
|
| 447 |
+
"\n",
|
| 448 |
+
" Parameters\n",
|
| 449 |
+
" model : torch model\n",
|
| 450 |
+
" loadOpt : True if optimizer to be loaded\n",
|
| 451 |
+
" \"\"\"\n",
|
| 452 |
+
" if not model.restored:\n",
|
| 453 |
+
" print(\"..restoring model checkpoint\")\n",
|
| 454 |
+
" modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
|
| 455 |
+
" modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
|
| 456 |
+
" filepath = os.path.join(modelDirectory, modelFile)\n",
|
| 457 |
+
" assert os.path.exists(filepath), \"model save file does not exist\"\n",
|
| 458 |
+
" checkpoint = torch.load(filepath)\n",
|
| 459 |
+
" model.load_state_dict(checkpoint[\"state_dict\"])\n",
|
| 460 |
+
" model.to(model.device)\n",
|
| 461 |
+
" if loadOpt:\n",
|
| 462 |
+
" model.optimizer.load_state_dict(checkpoint[\"optim_dict\"])\n",
|
| 463 |
+
" model.restored = True\n",
|
| 464 |
+
"\n",
|
| 465 |
+
" @staticmethod\n",
|
| 466 |
+
" def processClassifOutput(yPred, config):\n",
|
| 467 |
+
" \"\"\"\n",
|
| 468 |
+
" extracts probability label 1 or label with highest probability\n",
|
| 469 |
+
"\n",
|
| 470 |
+
" Parameters\n",
|
| 471 |
+
" yPred : predicted output\n",
|
| 472 |
+
" config : config object\n",
|
| 473 |
+
" \"\"\"\n",
|
| 474 |
+
" outType = config.getStringConfig(\"predict.output\")[0]\n",
|
| 475 |
+
" if outType == \"prob\":\n",
|
| 476 |
+
" outputSize = config.getIntConfig(\"train.output.size\")[0]\n",
|
| 477 |
+
" if outputSize == 2:\n",
|
| 478 |
+
" #return prob of pos class for binary classifier \n",
|
| 479 |
+
" yPred = yPred[:, 1]\n",
|
| 480 |
+
" else:\n",
|
| 481 |
+
" #return class value and probability for multi classifier \n",
|
| 482 |
+
" yCl = np.argmax(yPred, axis=1)\n",
|
| 483 |
+
" yPred = list(map(lambda y : y[0][y[1]], zip(yPred, yCl)))\n",
|
| 484 |
+
" yPred = zip(yCl, yPred)\n",
|
| 485 |
+
" else:\n",
|
| 486 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 487 |
+
" return yPred\n",
|
| 488 |
+
"\n",
|
| 489 |
+
" @staticmethod\n",
|
| 490 |
+
" def printPrediction(yPred, config, dataSource):\n",
|
| 491 |
+
" \"\"\"\n",
|
| 492 |
+
" prints input feature data and prediction\n",
|
| 493 |
+
"\n",
|
| 494 |
+
" Parameters\n",
|
| 495 |
+
" yPred : predicted output\n",
|
| 496 |
+
" config : config object\n",
|
| 497 |
+
" dataSource : data source str if file path or 2D array\n",
|
| 498 |
+
" \"\"\"\n",
|
| 499 |
+
" #prDataFilePath = config.getStringConfig(\"predict.data.file\")[0]\n",
|
| 500 |
+
" padWidth = config.getIntConfig(\"predict.feat.pad.size\")[0]\n",
|
| 501 |
+
" i = 0\n",
|
| 502 |
+
" if type(dataSource) == str:\n",
|
| 503 |
+
" for rec in fileRecGen(dataSource, \",\"):\n",
|
| 504 |
+
" feat = (\",\".join(rec)).ljust(padWidth, \" \")\n",
|
| 505 |
+
" rec = feat + \"\\t\" + str(yPred[i])\n",
|
| 506 |
+
" print(rec)\n",
|
| 507 |
+
" i += 1\n",
|
| 508 |
+
" else:\n",
|
| 509 |
+
" for rec in dataSource:\n",
|
| 510 |
+
" srec = toStrList(rec, 6)\n",
|
| 511 |
+
" feat = (\",\".join(srec)).ljust(padWidth, \" \")\n",
|
| 512 |
+
" srec = feat + \"\\t\" + str(yPred[i])\n",
|
| 513 |
+
" print(srec)\n",
|
| 514 |
+
" i += 1\n",
|
| 515 |
+
"\n",
|
| 516 |
+
"\n",
|
| 517 |
+
" @staticmethod\n",
|
| 518 |
+
" def allTrain(model):\n",
|
| 519 |
+
" \"\"\"\n",
|
| 520 |
+
" train with all data\n",
|
| 521 |
+
"\n",
|
| 522 |
+
" Parameters\n",
|
| 523 |
+
" model : torch model\n",
|
| 524 |
+
" \"\"\"\n",
|
| 525 |
+
" # train mode\n",
|
| 526 |
+
" model.train()\n",
|
| 527 |
+
" for t in range(model.numIter):\n",
|
| 528 |
+
"\n",
|
| 529 |
+
"\n",
|
| 530 |
+
" # Forward pass: Compute predicted y by passing x to the model\n",
|
| 531 |
+
" yPred = model(model.featData)\n",
|
| 532 |
+
"\n",
|
| 533 |
+
" # Compute and print loss\n",
|
| 534 |
+
" loss = model.lossFn(yPred, model.outData)\n",
|
| 535 |
+
" if model.verbose and t % 50 == 0:\n",
|
| 536 |
+
" print(\"epoch {} loss {:.6f}\".format(t, loss.item()))\n",
|
| 537 |
+
"\n",
|
| 538 |
+
" # Zero gradients, perform a backward pass, and update the weights.\n",
|
| 539 |
+
" model.optimizer.zero_grad()\n",
|
| 540 |
+
" loss.backward()\n",
|
| 541 |
+
" model.optimizer.step() \t\n",
|
| 542 |
+
"\n",
|
| 543 |
+
" #validate\n",
|
| 544 |
+
" model.eval()\n",
|
| 545 |
+
" yPred = model(model.validFeatData)\n",
|
| 546 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 547 |
+
" yActual = model.validOutData\n",
|
| 548 |
+
" if model.verbose:\n",
|
| 549 |
+
" result = np.concatenate((yPred, yActual), axis = 1)\n",
|
| 550 |
+
" print(\"predicted actual\")\n",
|
| 551 |
+
" print(result)\n",
|
| 552 |
+
"\n",
|
| 553 |
+
" score = perfMetric(model.accMetric, yActual, yPred)\n",
|
| 554 |
+
" print(formatFloat(3, score, \"perf score\"))\n",
|
| 555 |
+
" return score\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" @staticmethod\n",
|
| 558 |
+
" def batchTrain(model):\n",
|
| 559 |
+
" \"\"\"\n",
|
| 560 |
+
" train with batch data\n",
|
| 561 |
+
"\n",
|
| 562 |
+
" Parameters\n",
|
| 563 |
+
" model : torch model\n",
|
| 564 |
+
" \"\"\"\n",
|
| 565 |
+
" model.restored = False\n",
|
| 566 |
+
" trainData = TensorDataset(model.featData, model.outData)\n",
|
| 567 |
+
" trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)\n",
|
| 568 |
+
" epochIntv = model.config.getIntConfig(\"train.epoch.intv\")[0]\n",
|
| 569 |
+
"\n",
|
| 570 |
+
" # train mode\n",
|
| 571 |
+
" model.train()\n",
|
| 572 |
+
"\n",
|
| 573 |
+
" if model.trackErr:\n",
|
| 574 |
+
" trErr = list()\n",
|
| 575 |
+
" vaErr = list()\n",
|
| 576 |
+
" #epoch\n",
|
| 577 |
+
" for t in range(model.numIter):\n",
|
| 578 |
+
" #batch\n",
|
| 579 |
+
" b = 0\n",
|
| 580 |
+
" epochLoss = 0.0\n",
|
| 581 |
+
" for xBatch, yBatch in trainDataLoader:\n",
|
| 582 |
+
"\n",
|
| 583 |
+
" # Forward pass: Compute predicted y by passing x to the model\n",
|
| 584 |
+
" xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)\n",
|
| 585 |
+
" yPred = model(xBatch)\n",
|
| 586 |
+
"\n",
|
| 587 |
+
" # Compute and print loss\n",
|
| 588 |
+
" loss = model.lossFn(yPred, yBatch)\n",
|
| 589 |
+
" if model.verbose and t % epochIntv == 0 and b % model.batchIntv == 0:\n",
|
| 590 |
+
" print(\"epoch {} batch {} loss {:.6f}\".format(t, b, loss.item()))\n",
|
| 591 |
+
"\n",
|
| 592 |
+
" if model.trackErr and model.batchIntv == 0:\n",
|
| 593 |
+
" epochLoss += loss.item()\n",
|
| 594 |
+
"\n",
|
| 595 |
+
" #error tracking at batch level\n",
|
| 596 |
+
" if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
|
| 597 |
+
" trErr.append(loss.item())\n",
|
| 598 |
+
" vloss = FeedForwardNetwork.evaluateModel(model)\n",
|
| 599 |
+
" vaErr.append(vloss)\n",
|
| 600 |
+
"\n",
|
| 601 |
+
" # Zero gradients, perform a backward pass, and update the weights.\n",
|
| 602 |
+
" model.optimizer.zero_grad()\n",
|
| 603 |
+
" loss.backward()\n",
|
| 604 |
+
" model.optimizer.step() \t\n",
|
| 605 |
+
" b += 1\n",
|
| 606 |
+
"\n",
|
| 607 |
+
" #error tracking at epoch level\n",
|
| 608 |
+
" if model.trackErr and model.batchIntv == 0:\n",
|
| 609 |
+
" epochLoss /= len(trainDataLoader)\n",
|
| 610 |
+
" trErr.append(epochLoss)\n",
|
| 611 |
+
" vloss = FeedForwardNetwork.evaluateModel(model)\n",
|
| 612 |
+
" vaErr.append(vloss)\n",
|
| 613 |
+
"\n",
|
| 614 |
+
" #validate\n",
|
| 615 |
+
" model.eval()\n",
|
| 616 |
+
" yPred = model(model.validFeatData)\n",
|
| 617 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 618 |
+
" yActual = model.validOutData\n",
|
| 619 |
+
" if model.verbose:\n",
|
| 620 |
+
" vsize = yPred.shape[0]\n",
|
| 621 |
+
" print(\"\\npredicted \\t\\t actual\")\n",
|
| 622 |
+
" for i in range(vsize):\n",
|
| 623 |
+
" print(str(yPred[i]) + \"\\t\" + str(yActual[i]))\n",
|
| 624 |
+
"\n",
|
| 625 |
+
" score = perfMetric(model.accMetric, yActual, yPred)\n",
|
| 626 |
+
" print(yActual)\n",
|
| 627 |
+
" print(yPred)\n",
|
| 628 |
+
" print(formatFloat(3, score, \"perf score\"))\n",
|
| 629 |
+
"\n",
|
| 630 |
+
" #save\n",
|
| 631 |
+
" modelSave = model.config.getBooleanConfig(\"train.model.save\")[0]\n",
|
| 632 |
+
" if modelSave:\n",
|
| 633 |
+
" FeedForwardNetwork.saveCheckpt(model)\n",
|
| 634 |
+
"\n",
|
| 635 |
+
" if model.trackErr:\n",
|
| 636 |
+
" FeedForwardNetwork.errorPlot(model, trErr, vaErr)\n",
|
| 637 |
+
"\n",
|
| 638 |
+
" if model.config.getBooleanConfig(\"train.print.weights\")[0]:\n",
|
| 639 |
+
" print(\"model weights\")\n",
|
| 640 |
+
" for param in model.parameters():\n",
|
| 641 |
+
" print(param.data)\n",
|
| 642 |
+
" return score\n",
|
| 643 |
+
"\n",
|
| 644 |
+
" @staticmethod\n",
|
| 645 |
+
" def errorPlot(model, trErr, vaErr):\n",
|
| 646 |
+
" \"\"\"\n",
|
| 647 |
+
" plot errors\n",
|
| 648 |
+
"\n",
|
| 649 |
+
" Parameters\n",
|
| 650 |
+
" trErr : training error list\t\n",
|
| 651 |
+
" vaErr : validation error list\t\n",
|
| 652 |
+
" \"\"\"\n",
|
| 653 |
+
" x = np.arange(len(trErr))\n",
|
| 654 |
+
" plt.plot(x,trErr,label = \"training error\")\n",
|
| 655 |
+
" plt.plot(x,vaErr,label = \"validation error\")\n",
|
| 656 |
+
" plt.xlabel(\"iteration\")\n",
|
| 657 |
+
" plt.ylabel(\"error\")\n",
|
| 658 |
+
" plt.legend([\"training error\", \"validation error\"], loc='upper left')\n",
|
| 659 |
+
" plt.show()\n",
|
| 660 |
+
"\n",
|
| 661 |
+
" @staticmethod\n",
|
| 662 |
+
" def modelPredict(model, dataSource = None):\n",
|
| 663 |
+
" \"\"\"\n",
|
| 664 |
+
" predict\n",
|
| 665 |
+
"\n",
|
| 666 |
+
" Parameters\n",
|
| 667 |
+
" model : torch model\n",
|
| 668 |
+
" dataSource : data source\n",
|
| 669 |
+
" \"\"\"\n",
|
| 670 |
+
" #train or restore model\n",
|
| 671 |
+
" useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
|
| 672 |
+
" if useSavedModel:\n",
|
| 673 |
+
" FeedForwardNetwork.restoreCheckpt(model)\n",
|
| 674 |
+
" else:\n",
|
| 675 |
+
" FeedForwardNetwork.batchTrain(model) \n",
|
| 676 |
+
"\n",
|
| 677 |
+
" #predict\n",
|
| 678 |
+
" if dataSource is None:\n",
|
| 679 |
+
" dataSource = model.config.getStringConfig(\"predict.data.file\")[0]\n",
|
| 680 |
+
" featData = FeedForwardNetwork.prepData(model, dataSource, False)\n",
|
| 681 |
+
" #print(featData)\n",
|
| 682 |
+
" featData = torch.from_numpy(featData)\n",
|
| 683 |
+
" featData = featData.to(model.device)\n",
|
| 684 |
+
"\n",
|
| 685 |
+
" model.eval()\n",
|
| 686 |
+
" yPred = model(featData)\n",
|
| 687 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 688 |
+
" #print(yPred)\n",
|
| 689 |
+
"\n",
|
| 690 |
+
" if model.outputSize >= 2:\n",
|
| 691 |
+
" #classification\n",
|
| 692 |
+
" yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)\n",
|
| 693 |
+
"\n",
|
| 694 |
+
" # print prediction\n",
|
| 695 |
+
" if model.config.getBooleanConfig(\"predict.print.output\")[0]:\n",
|
| 696 |
+
" FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)\n",
|
| 697 |
+
"\n",
|
| 698 |
+
" return yPred\n",
|
| 699 |
+
"\n",
|
| 700 |
+
" def predict(self, dataSource = None):\n",
|
| 701 |
+
" \"\"\"\n",
|
| 702 |
+
" predict\n",
|
| 703 |
+
"\n",
|
| 704 |
+
" Parameters\n",
|
| 705 |
+
" dataSource : data source\n",
|
| 706 |
+
" \"\"\"\n",
|
| 707 |
+
" return FeedForwardNetwork.modelPredict(self, dataSource)\n",
|
| 708 |
+
"\n",
|
| 709 |
+
" @staticmethod\n",
|
| 710 |
+
" def evaluateModel(model):\n",
|
| 711 |
+
" \"\"\"\n",
|
| 712 |
+
" evaluate model\n",
|
| 713 |
+
"\n",
|
| 714 |
+
" Parameters\n",
|
| 715 |
+
" model : torch model\n",
|
| 716 |
+
" \"\"\"\n",
|
| 717 |
+
" model.eval()\n",
|
| 718 |
+
" with torch.no_grad():\n",
|
| 719 |
+
" yPred = model(model.validFeatData)\n",
|
| 720 |
+
" #yPred = yPred.data.cpu().numpy()\n",
|
| 721 |
+
" yActual = model.validOutData\n",
|
| 722 |
+
" score = model.lossFn(yPred, yActual).item()\n",
|
| 723 |
+
" model.train()\n",
|
| 724 |
+
" return score\n",
|
| 725 |
+
"\n",
|
| 726 |
+
" @staticmethod\n",
|
| 727 |
+
" def prepValidate(model, dataSource=None):\n",
|
| 728 |
+
" \"\"\"\n",
|
| 729 |
+
" prepare for validation\n",
|
| 730 |
+
"\n",
|
| 731 |
+
" Parameters\n",
|
| 732 |
+
" model : torch model\n",
|
| 733 |
+
" dataSource : data source\n",
|
| 734 |
+
" \"\"\"\n",
|
| 735 |
+
" #train or restore model\n",
|
| 736 |
+
" if not model.restored:\n",
|
| 737 |
+
" useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
|
| 738 |
+
" if useSavedModel:\n",
|
| 739 |
+
" FeedForwardNetwork.restoreCheckpt(model)\n",
|
| 740 |
+
" else:\n",
|
| 741 |
+
" FeedForwardNetwork.batchTrain(model)\n",
|
| 742 |
+
" model.restored = True\n",
|
| 743 |
+
"\n",
|
| 744 |
+
" if \tdataSource is not None:\n",
|
| 745 |
+
" model.setValidationData(dataSource)\n",
|
| 746 |
+
"\n",
|
| 747 |
+
" @staticmethod\n",
|
| 748 |
+
" def validateModel(model, retPred=False):\n",
|
| 749 |
+
" \"\"\"\n",
|
| 750 |
+
" pmodel validation\n",
|
| 751 |
+
"\n",
|
| 752 |
+
" Parameters\n",
|
| 753 |
+
" model : torch model\n",
|
| 754 |
+
" retPred : if True return prediction\n",
|
| 755 |
+
" \"\"\"\n",
|
| 756 |
+
" model.eval()\n",
|
| 757 |
+
" yPred = model(model.validFeatData)\n",
|
| 758 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 759 |
+
" model.yPred = yPred\n",
|
| 760 |
+
" yActual = model.validOutData\n",
|
| 761 |
+
" vsize = yPred.shape[0]\n",
|
| 762 |
+
" if model.verbose:\n",
|
| 763 |
+
" print(\"\\npredicted \\t actual\")\n",
|
| 764 |
+
" for i in range(vsize):\n",
|
| 765 |
+
" print(\"{:.3f}\\t\\t{:.3f}\".format(yPred[i][0], yActual[i][0]))\n",
|
| 766 |
+
"\n",
|
| 767 |
+
" score = perfMetric(model.accMetric, yActual, yPred)\n",
|
| 768 |
+
" print(formatFloat(3, score, \"perf score\"))\n",
|
| 769 |
+
"\n",
|
| 770 |
+
" if retPred:\n",
|
| 771 |
+
" y = list(map(lambda i : (yPred[i][0], yActual[i][0]), range(vsize)))\n",
|
| 772 |
+
" res = (y, score)\n",
|
| 773 |
+
" return res\n",
|
| 774 |
+
" else:\t\n",
|
| 775 |
+
" return score"
|
| 776 |
+
]
|
| 777 |
+
}
|
| 778 |
+
],
|
| 779 |
+
"metadata": {
|
| 780 |
+
"kernelspec": {
|
| 781 |
+
"display_name": "Python 3 (ipykernel)",
|
| 782 |
+
"language": "python",
|
| 783 |
+
"name": "python3"
|
| 784 |
+
},
|
| 785 |
+
"language_info": {
|
| 786 |
+
"codemirror_mode": {
|
| 787 |
+
"name": "ipython",
|
| 788 |
+
"version": 3
|
| 789 |
+
},
|
| 790 |
+
"file_extension": ".py",
|
| 791 |
+
"mimetype": "text/x-python",
|
| 792 |
+
"name": "python",
|
| 793 |
+
"nbconvert_exporter": "python",
|
| 794 |
+
"pygments_lexer": "ipython3",
|
| 795 |
+
"version": "3.9.12"
|
| 796 |
+
}
|
| 797 |
+
},
|
| 798 |
+
"nbformat": 4,
|
| 799 |
+
"nbformat_minor": 5
|
| 800 |
+
}
|
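The training, validation and prediction helpers above are driven entirely by the model object and its configuration keys (train.model.save, predict.use.saved.model, predict.print.output and so on), so a short usage sketch helps tie them together. The sketch below is a minimal illustration under stated assumptions: the constructor call, the buildModel() step and the file names are hypothetical and do not appear in this diff, while batchTrain(), validateModel() and predict() are used exactly as defined above.

# minimal sketch, assuming a config-file driven constructor and a buildModel()
# setup step; both are placeholders, neither appears in this diff
model = FeedForwardNetwork("ffn.properties")        # hypothetical config file
model.buildModel()                                  # assumed network/optimizer setup

# mini batch training with train/validation error tracking; a checkpoint is
# saved when train.model.save is set in the config
trScore = FeedForwardNetwork.batchTrain(model)

# validation against the configured validation data, returning (predicted, actual) pairs
pairs, vaScore = FeedForwardNetwork.validateModel(model, retPred=True)

# prediction from a file path or an in-memory 2D array; predict.use.saved.model
# decides whether a saved checkpoint is restored or the model is retrained first
yPred = model.predict("ffn_pred.txt")               # hypothetical prediction data file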
lib/.ipynb_checkpoints/txproc-checkpoint.ipynb
ADDED
|
@@ -0,0 +1,1002 @@
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "f720c141",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"from random import randint\n",
|
| 13 |
+
"import random\n",
|
| 14 |
+
"import time\n",
|
| 15 |
+
"from datetime import datetime\n",
|
| 16 |
+
"import re, string, unicodedata\n",
|
| 17 |
+
"import nltk\n",
|
| 18 |
+
"import contractions\n",
|
| 19 |
+
"import inflect\n",
|
| 20 |
+
"from bs4 import BeautifulSoup\n",
|
| 21 |
+
"from nltk import word_tokenize, sent_tokenize\n",
|
| 22 |
+
"from nltk.corpus import stopwords\n",
|
| 23 |
+
"from nltk.stem.isri import ISRIStemmer\n",
|
| 24 |
+
"from nltk.stem.porter import PorterStemmer\n",
|
| 25 |
+
"from nltk.stem.snowball import SnowballStemmer\n",
|
| 26 |
+
"from nltk.stem import LancasterStemmer, WordNetLemmatizer\n",
|
| 27 |
+
"from nltk.tag import StanfordNERTagger\n",
|
| 28 |
+
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
|
| 29 |
+
"import spacy\n",
|
| 30 |
+
"import torch\n",
|
| 31 |
+
"from collections import defaultdict\n",
|
| 32 |
+
"import pickle\n",
|
| 33 |
+
"import numpy as np\n",
|
| 34 |
+
"import re\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"sys.path.append(os.path.abspath(\"../lib\"))\n",
|
| 37 |
+
"from util import *\n",
|
| 38 |
+
"from mlutil import *\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"lcc = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
|
| 41 |
+
"\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
|
| 42 |
+
"ucc = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\", \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\"]\n",
|
| 43 |
+
"dig = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
|
| 44 |
+
"spc = [\"@\",\"#\",\"$\",\"%\",\"^\",\"&\",\"*\",\"(\",\")\",\"_\",\"+\",\"{\",\"}\",\"[\",\"]\",\"|\",\":\",\"<\",\">\",\"?\",\";\",\",\",\".\"]\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"class TextPreProcessor:\n",
|
| 48 |
+
" \"\"\"\n",
|
| 49 |
+
" text preprocessor\n",
|
| 50 |
+
" \"\"\"\n",
|
| 51 |
+
" def __init__(self, stemmer = \"lancaster\", verbose=False):\n",
|
| 52 |
+
" self.verbose = verbose\n",
|
| 53 |
+
" self.lemmatizer = WordNetLemmatizer()\n",
|
| 54 |
+
"\n",
|
| 55 |
+
" def stripHtml(self, text):\n",
|
| 56 |
+
" soup = BeautifulSoup(text, \"html.parser\")\n",
|
| 57 |
+
" return soup.get_text()\n",
|
| 58 |
+
"\n",
|
| 59 |
+
" def removeBetweenSquareBrackets(self, text):\n",
|
| 60 |
+
" return re.sub('\\[[^]]*\\]', '', text)\n",
|
| 61 |
+
"\n",
|
| 62 |
+
" def denoiseText(self, text):\n",
|
| 63 |
+
" text = stripHtml(text)\n",
|
| 64 |
+
" text = removeBetweenSquareBrackets(text)\n",
|
| 65 |
+
" return text\n",
|
| 66 |
+
"\n",
|
| 67 |
+
" def replaceContractions(self, text):\n",
|
| 68 |
+
" \"\"\"Replace contractions in string of text\"\"\"\n",
|
| 69 |
+
" return contractions.fix(text)\n",
|
| 70 |
+
"\n",
|
| 71 |
+
" def tokenize(self, text):\n",
|
| 72 |
+
" words = nltk.word_tokenize(text)\n",
|
| 73 |
+
" return words\n",
|
| 74 |
+
"\n",
|
| 75 |
+
" def removeNonAscii(self, words):\n",
|
| 76 |
+
" \"\"\"Remove non-ASCII characters from list of tokenized words\"\"\"\n",
|
| 77 |
+
" newWords = []\n",
|
| 78 |
+
" for word in words:\n",
|
| 79 |
+
" if isinstance(word, unicode):\n",
|
| 80 |
+
" newWord = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')\n",
|
| 81 |
+
" else:\n",
|
| 82 |
+
" newWord = word\n",
|
| 83 |
+
" newWords.append(newWord)\n",
|
| 84 |
+
" return newWords\n",
|
| 85 |
+
"\n",
|
| 86 |
+
" def replaceNonAsciiFromText(self, text):\n",
|
| 87 |
+
" \"\"\" replaces non ascii with blank \"\"\"\n",
|
| 88 |
+
" return ''.join([i if ord(i) < 128 else ' ' for i in text])\n",
|
| 89 |
+
"\n",
|
| 90 |
+
" def removeNonAsciiFromText(self, text):\n",
|
| 91 |
+
" \"\"\" replaces non ascii with blank \"\"\"\n",
|
| 92 |
+
" return ''.join([i if ord(i) < 128 else '' for i in text])\n",
|
| 93 |
+
"\n",
|
| 94 |
+
" def allow(self, words):\n",
|
| 95 |
+
" \"\"\" allow only specific charaters \"\"\"\n",
|
| 96 |
+
" allowed = [word for word in words if re.match('^[A-Za-z0-9\\.\\,\\:\\;\\!\\?\\(\\)\\'\\-\\$\\@\\%\\\"]+$', word) is not None]\t\t\n",
|
| 97 |
+
" return allowed\t\t\n",
|
| 98 |
+
"\n",
|
| 99 |
+
" def toLowercase(self, words):\n",
|
| 100 |
+
" \"\"\"Convert all characters to lowercase from list of tokenized words\"\"\"\n",
|
| 101 |
+
" newWords = [word.lower() for word in words]\n",
|
| 102 |
+
" return newWords\n",
|
| 103 |
+
"\n",
|
| 104 |
+
" def removePunctuation(self, words):\n",
|
| 105 |
+
" \"\"\"Remove punctuation from list of tokenized words\"\"\"\n",
|
| 106 |
+
" newWords = []\n",
|
| 107 |
+
" for word in words:\n",
|
| 108 |
+
" newWord = re.sub(r'[^\\w\\s]', '', word)\n",
|
| 109 |
+
" if newWord != '':\n",
|
| 110 |
+
" newWords.append(newWord)\n",
|
| 111 |
+
" return newWords\n",
|
| 112 |
+
"\n",
|
| 113 |
+
" def replaceNumbers(self, words):\n",
|
| 114 |
+
" \"\"\"Replace all interger occurrences in list of tokenized words with textual representation\"\"\"\n",
|
| 115 |
+
" p = inflect.engine()\n",
|
| 116 |
+
" newWords = []\n",
|
| 117 |
+
" for word in words:\n",
|
| 118 |
+
" if word.isdigit():\n",
|
| 119 |
+
" newWord = p.number_to_words(word)\n",
|
| 120 |
+
" newWords.append(newWord)\n",
|
| 121 |
+
" else:\n",
|
| 122 |
+
" newWords.append(word)\n",
|
| 123 |
+
" return newWords\n",
|
| 124 |
+
"\n",
|
| 125 |
+
" def removeStopwords(self, words):\n",
|
| 126 |
+
" \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
|
| 127 |
+
" newWords = []\n",
|
| 128 |
+
" for word in words:\n",
|
| 129 |
+
" if word not in stopwords.words('english'):\n",
|
| 130 |
+
" newWords.append(word)\n",
|
| 131 |
+
" return newWords\n",
|
| 132 |
+
"\n",
|
| 133 |
+
" def removeCustomStopwords(self, words, stopWords):\n",
|
| 134 |
+
" \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
|
| 135 |
+
" removed = [word for word in words if word not in stopWords]\t\t\n",
|
| 136 |
+
" return removed\n",
|
| 137 |
+
"\n",
|
| 138 |
+
" def removeLowFreqWords(self, words, minFreq):\n",
|
| 139 |
+
" \"\"\"Remove low frewquncy words from list of tokenized words\"\"\"\n",
|
| 140 |
+
" frequency = defaultdict(int)\n",
|
| 141 |
+
" for word in words:\n",
|
| 142 |
+
" frequency[word] += 1\n",
|
| 143 |
+
" removed = [word for word in words if frequency[word] > minFreq]\t\t\n",
|
| 144 |
+
" return removed\t\n",
|
| 145 |
+
"\n",
|
| 146 |
+
" def removeNumbers(self, words):\n",
|
| 147 |
+
" \"\"\"Remove numbers\"\"\"\n",
|
| 148 |
+
" removed = [word for word in words if not isNumber(word)]\t\t\n",
|
| 149 |
+
" return removed\t\t\n",
|
| 150 |
+
"\n",
|
| 151 |
+
" def removeShortWords(self, words, minLengh):\n",
|
| 152 |
+
" \"\"\"Remove short words \"\"\"\n",
|
| 153 |
+
" removed = [word for word in words if len(word) >= minLengh]\t\t\n",
|
| 154 |
+
" return removed\t\t\n",
|
| 155 |
+
"\n",
|
| 156 |
+
" def keepAllowedWords(self, words, keepWords):\n",
|
| 157 |
+
" \"\"\"Keep words from the list only\"\"\"\n",
|
| 158 |
+
" kept = [word for word in words if word in keepWords]\t\t\n",
|
| 159 |
+
" return kept\n",
|
| 160 |
+
"\n",
|
| 161 |
+
" def stemWords(self, words):\n",
|
| 162 |
+
" \"\"\"Stem words in list of tokenized words\"\"\"\n",
|
| 163 |
+
" if stemmer == \"lancaster\":\n",
|
| 164 |
+
" stemmer = LancasterStemmer()\n",
|
| 165 |
+
" elif stemmer == \"snowbal\":\n",
|
| 166 |
+
" stemmer = SnowballStemmer()\n",
|
| 167 |
+
" elif stemmer == \"porter\":\n",
|
| 168 |
+
" stemmer = PorterStemmer()\n",
|
| 169 |
+
" stems = [stemmer.stem(word) for word in words]\n",
|
| 170 |
+
" return stems\n",
|
| 171 |
+
"\n",
|
| 172 |
+
" def lemmatizeWords(self, words):\n",
|
| 173 |
+
" \"\"\"Lemmatize tokens in list of tokenized words\"\"\"\n",
|
| 174 |
+
" lemmas = [self.lemmatizer.lemmatize(word) for word in words]\n",
|
| 175 |
+
" return lemmas\n",
|
| 176 |
+
"\n",
|
| 177 |
+
" def lemmatizeVerbs(self, words):\n",
|
| 178 |
+
" \"\"\"Lemmatize verbs in list of tokenized words\"\"\"\n",
|
| 179 |
+
" lemmas = [self.lemmatizer.lemmatize(word, pos='v') for word in words]\n",
|
| 180 |
+
" return lemmas\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" def normalize(self, words):\n",
|
| 183 |
+
" words = self.removeNonAscii(words)\n",
|
| 184 |
+
" words = self.toLowercase(words)\n",
|
| 185 |
+
" words = self.removePunctuation(words)\n",
|
| 186 |
+
" words = self.replaceNumbers(words)\n",
|
| 187 |
+
" words = self.removeStopwords(words)\n",
|
| 188 |
+
" return words\n",
|
| 189 |
+
"\n",
|
| 190 |
+
" def posTag(self, textTokens):\n",
|
| 191 |
+
" tags = nltk.pos_tag(textTokens)\n",
|
| 192 |
+
" return tags\n",
|
| 193 |
+
"\n",
|
| 194 |
+
" def extractEntity(self, textTokens, classifierPath, jarPath):\n",
|
| 195 |
+
" st = StanfordNERTagger(classifierPath, jarPath) \n",
|
| 196 |
+
" entities = st.tag(textTokens)\n",
|
| 197 |
+
" return entities\n",
|
| 198 |
+
"\n",
|
| 199 |
+
" def documentFeatures(self, document, wordFeatures):\n",
|
| 200 |
+
" documentWords = set(document)\n",
|
| 201 |
+
" features = {}\n",
|
| 202 |
+
" for word in wordFeatures:\n",
|
| 203 |
+
" features[word] = (word in documentWords)\n",
|
| 204 |
+
" return features\n",
|
| 205 |
+
"\n",
|
| 206 |
+
"class NGram:\n",
|
| 207 |
+
" \"\"\"\n",
|
| 208 |
+
" word ngram\n",
|
| 209 |
+
" \"\"\"\n",
|
| 210 |
+
" def __init__(self, vocFilt, verbose=False):\n",
|
| 211 |
+
" \"\"\"\n",
|
| 212 |
+
" initialize\n",
|
| 213 |
+
" \"\"\"\n",
|
| 214 |
+
" self.vocFilt = vocFilt\n",
|
| 215 |
+
" self.nGramCounter = dict()\n",
|
| 216 |
+
" self.nGramFreq = dict()\n",
|
| 217 |
+
" self.corpSize = 0\n",
|
| 218 |
+
" self.vocabulary = set()\n",
|
| 219 |
+
" self.freqDone = False\n",
|
| 220 |
+
" self.verbose = verbose\n",
|
| 221 |
+
" self.vecWords = None\n",
|
| 222 |
+
" self.nonZeroCount = 0\n",
|
| 223 |
+
"\n",
|
| 224 |
+
" def countDocNGrams(self, words):\n",
|
| 225 |
+
" \"\"\"\n",
|
| 226 |
+
" count words in a doc\n",
|
| 227 |
+
" \"\"\"\n",
|
| 228 |
+
" if self.verbose:\n",
|
| 229 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 230 |
+
" nGrams = self.toNGram(words)\n",
|
| 231 |
+
" for nGram in nGrams:\n",
|
| 232 |
+
" count = self.nGramCounter.get(nGram, 0)\n",
|
| 233 |
+
" self.nGramCounter[nGram] = count + 1\n",
|
| 234 |
+
" self.corpSize += 1\n",
|
| 235 |
+
" self.vocabulary.update(words)\t\n",
|
| 236 |
+
"\n",
|
| 237 |
+
" def remLowCount(self, minCount):\n",
|
| 238 |
+
" \"\"\"\n",
|
| 239 |
+
" removes items with count below threshold\n",
|
| 240 |
+
" \"\"\"\n",
|
| 241 |
+
" self.nGramCounter = dict(filter(lambda item: item[1] >= minCount, self.nGramCounter.items()))\n",
|
| 242 |
+
"\n",
|
| 243 |
+
" def getVocabSize(self):\n",
|
| 244 |
+
" \"\"\"\n",
|
| 245 |
+
" get vocabulary size\n",
|
| 246 |
+
" \"\"\"\n",
|
| 247 |
+
" return len(self.nGramCounter)\n",
|
| 248 |
+
"\n",
|
| 249 |
+
" def getNGramFreq(self):\n",
|
| 250 |
+
" \"\"\"\n",
|
| 251 |
+
" get normalized count\n",
|
| 252 |
+
" \"\"\"\n",
|
| 253 |
+
" if self.verbose:\n",
|
| 254 |
+
" print (\"counter size \" + str(len(self.nGramCounter)))\n",
|
| 255 |
+
" if not self.freqDone:\n",
|
| 256 |
+
" for item in self.nGramCounter.items():\n",
|
| 257 |
+
" self.nGramFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
|
| 258 |
+
" self.freqDone = True\n",
|
| 259 |
+
" return self.nGramFreq\n",
|
| 260 |
+
"\n",
|
| 261 |
+
" def getNGramIndex(self, show):\n",
|
| 262 |
+
" \"\"\"\n",
|
| 263 |
+
" convert to list\n",
|
| 264 |
+
" \"\"\"\n",
|
| 265 |
+
" if self.vecWords is None:\n",
|
| 266 |
+
" self.vecWords = list(self.nGramCounter)\n",
|
| 267 |
+
" if show:\n",
|
| 268 |
+
" for vw in enumerate(self.vecWords):\n",
|
| 269 |
+
" print(vw)\n",
|
| 270 |
+
"\n",
|
| 271 |
+
" def getVector(self, words, byCount, normalized):\n",
|
| 272 |
+
" \"\"\"\n",
|
| 273 |
+
" convert to vector\n",
|
| 274 |
+
" \"\"\"\n",
|
| 275 |
+
" if self.vecWords is None:\n",
|
| 276 |
+
" self.vecWords = list(self.nGramCounter)\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" nGrams = self.toNGram(words)\n",
|
| 279 |
+
" if self.verbose:\n",
|
| 280 |
+
" print(\"vocabulary size {}\".format(len(self.vecWords)))\n",
|
| 281 |
+
" print(\"ngrams\")\n",
|
| 282 |
+
" print(nGrams)\n",
|
| 283 |
+
" self.nonZeroCount = 0\n",
|
| 284 |
+
" vec = list(map(lambda vw: self.getVecElem(vw, nGrams, byCount, normalized), self.vecWords))\n",
|
| 285 |
+
" return vec\n",
|
| 286 |
+
"\n",
|
| 287 |
+
" def getVecElem(self, vw, nGrams, byCount, normalized):\n",
|
| 288 |
+
" \"\"\"\n",
|
| 289 |
+
" get vector element\n",
|
| 290 |
+
" \"\"\"\n",
|
| 291 |
+
" if vw in nGrams:\n",
|
| 292 |
+
" if byCount:\n",
|
| 293 |
+
" if normalized:\n",
|
| 294 |
+
" el = self.nGramFreq[vw]\n",
|
| 295 |
+
" else:\n",
|
| 296 |
+
" el = self.nGramCounter[vw]\n",
|
| 297 |
+
" else:\n",
|
| 298 |
+
" el = 1\n",
|
| 299 |
+
" self.nonZeroCount += 1\n",
|
| 300 |
+
" else:\n",
|
| 301 |
+
" if (byCount and normalized):\n",
|
| 302 |
+
" el = 0.0\n",
|
| 303 |
+
" else:\n",
|
| 304 |
+
" el = 0\n",
|
| 305 |
+
" return el\n",
|
| 306 |
+
"\n",
|
| 307 |
+
" def getNonZeroCount(self):\n",
|
| 308 |
+
" \"\"\"\n",
|
| 309 |
+
" get non zero vector element count\n",
|
| 310 |
+
" \"\"\"\n",
|
| 311 |
+
" return self.nonZeroCount\n",
|
| 312 |
+
"\n",
|
| 313 |
+
" def toBiGram(self, words):\n",
|
| 314 |
+
" \"\"\"\n",
|
| 315 |
+
" convert to bigram\n",
|
| 316 |
+
" \"\"\"\n",
|
| 317 |
+
" if self.verbose:\n",
|
| 318 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 319 |
+
" biGrams = list()\n",
|
| 320 |
+
" for i in range(len(words)-1):\n",
|
| 321 |
+
" w1 = words[i]\n",
|
| 322 |
+
" w2 = words[i+1]\n",
|
| 323 |
+
" if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt):\n",
|
| 324 |
+
" nGram = (w1, w2)\n",
|
| 325 |
+
" biGrams.append(nGram)\n",
|
| 326 |
+
" return biGrams\n",
|
| 327 |
+
"\n",
|
| 328 |
+
" def toTriGram(self, words):\n",
|
| 329 |
+
" \"\"\"\n",
|
| 330 |
+
" convert to trigram\n",
|
| 331 |
+
" \"\"\"\n",
|
| 332 |
+
" if self.verbose:\n",
|
| 333 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 334 |
+
" triGrams = list()\n",
|
| 335 |
+
" for i in range(len(words)-2):\n",
|
| 336 |
+
" w1 = words[i]\n",
|
| 337 |
+
" w2 = words[i+1]\n",
|
| 338 |
+
" w3 = words[i+2]\n",
|
| 339 |
+
" if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt and w3 in self.vocFilt):\n",
|
| 340 |
+
" nGram = (w1, w2, w3)\n",
|
| 341 |
+
" triGrams.append(nGram)\n",
|
| 342 |
+
" return triGrams\n",
|
| 343 |
+
"\n",
|
| 344 |
+
" def save(self, saveFile):\n",
|
| 345 |
+
" \"\"\"\n",
|
| 346 |
+
" save \n",
|
| 347 |
+
" \"\"\"\n",
|
| 348 |
+
" sf = open(saveFile, \"wb\")\n",
|
| 349 |
+
" pickle.dump(self, sf)\n",
|
| 350 |
+
" sf.close()\n",
|
| 351 |
+
"\n",
|
| 352 |
+
" @staticmethod\n",
|
| 353 |
+
" def load(saveFile):\n",
|
| 354 |
+
" \"\"\"\n",
|
| 355 |
+
" load\n",
|
| 356 |
+
" \"\"\"\n",
|
| 357 |
+
" sf = open(saveFile, \"rb\")\n",
|
| 358 |
+
" nGrams = pickle.load(sf)\n",
|
| 359 |
+
" sf.close()\n",
|
| 360 |
+
" return nGrams\n",
|
| 361 |
+
"\n",
|
| 362 |
+
"class CharNGram:\n",
|
| 363 |
+
" \"\"\"\n",
|
| 364 |
+
" character n gram\n",
|
| 365 |
+
" \"\"\"\n",
|
| 366 |
+
" def __init__(self, domains, ngsize, verbose=False):\n",
|
| 367 |
+
" \"\"\"\n",
|
| 368 |
+
" initialize\n",
|
| 369 |
+
" \"\"\"\n",
|
| 370 |
+
" self.chDomain = list()\n",
|
| 371 |
+
" self.ws = \"#\"\n",
|
| 372 |
+
" self.chDomain.append(self.ws)\n",
|
| 373 |
+
" for d in domains:\n",
|
| 374 |
+
" if d == \"lcc\":\n",
|
| 375 |
+
" self.chDomain.extend(lcc)\n",
|
| 376 |
+
" elif d == \"ucc\":\n",
|
| 377 |
+
" self.chDomain.extend(ucc)\n",
|
| 378 |
+
" elif d == \"dig\":\n",
|
| 379 |
+
" self.chDomain.extend(dig)\n",
|
| 380 |
+
" elif d == \"spc\":\n",
|
| 381 |
+
" self.chDomain.extend(spc)\n",
|
| 382 |
+
" else:\n",
|
| 383 |
+
" raise ValueError(\"invalid character type \" + d)\n",
|
| 384 |
+
"\n",
|
| 385 |
+
" self.ngsize = ngsize\n",
|
| 386 |
+
" self.radixPow = None\n",
|
| 387 |
+
" self.cntVecSize = None\n",
|
| 388 |
+
"\n",
|
| 389 |
+
" def addSpChar(self, spChar):\n",
|
| 390 |
+
" \"\"\"\n",
|
| 391 |
+
" add special characters\n",
|
| 392 |
+
" \"\"\"\n",
|
| 393 |
+
" self.chDomain.extend(spChar)\n",
|
| 394 |
+
"\n",
|
| 395 |
+
" def setWsRepl(self, ws):\n",
|
| 396 |
+
" \"\"\"\n",
|
| 397 |
+
" set white space replacement charater\n",
|
| 398 |
+
" \"\"\"\n",
|
| 399 |
+
" self.ws = ws\n",
|
| 400 |
+
" self.chDomain[0] = self.ws\n",
|
| 401 |
+
"\n",
|
| 402 |
+
" def finalize(self):\n",
|
| 403 |
+
" \"\"\"\n",
|
| 404 |
+
" final setup\n",
|
| 405 |
+
" \"\"\"\t\t\n",
|
| 406 |
+
" domSize = len(self.chDomain)\n",
|
| 407 |
+
" self.cntVecSize = int(math.pow(domSize, self.ngsize))\n",
|
| 408 |
+
" if self.radixPow is None:\n",
|
| 409 |
+
" self.radixPow = list()\n",
|
| 410 |
+
" for i in range(self.ngsize-1, 0, -1):\n",
|
| 411 |
+
" self.radixPow.append(int(math.pow(domSize, i)))\n",
|
| 412 |
+
" self.radixPow.append(1)\n",
|
| 413 |
+
"\n",
|
| 414 |
+
"\n",
|
| 415 |
+
" def toMgramCount(self, text):\n",
|
| 416 |
+
" \"\"\"\n",
|
| 417 |
+
" get ngram count list\n",
|
| 418 |
+
" \"\"\"\n",
|
| 419 |
+
" #print(text)\n",
|
| 420 |
+
" ngCounts = [0] * self.cntVecSize\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" ngram = list()\n",
|
| 423 |
+
" totNgCount = 0\n",
|
| 424 |
+
" for ch in text:\n",
|
| 425 |
+
" if ch.isspace():\n",
|
| 426 |
+
" l = len(ngram)\n",
|
| 427 |
+
" if l == 0 or ngram[l-1] != self.ws:\n",
|
| 428 |
+
" ngram.append(self.ws)\n",
|
| 429 |
+
" else:\n",
|
| 430 |
+
" ngram.append(ch)\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" if len(ngram) == self.ngsize:\n",
|
| 433 |
+
" i = self.__getNgramIndex(ngram)\n",
|
| 434 |
+
" assert i < self.cntVecSize, \"ngram index out of range index \" + str(i) + \" size \" + str(self.cntVecSize) \n",
|
| 435 |
+
" ngCounts[i] += 1\n",
|
| 436 |
+
" ngram.clear()\n",
|
| 437 |
+
" totNgCount += 1\n",
|
| 438 |
+
"\n",
|
| 439 |
+
" return ngCounts\n",
|
| 440 |
+
"\n",
|
| 441 |
+
" def __getNgramIndex(self, ngram):\n",
|
| 442 |
+
" \"\"\"\n",
|
| 443 |
+
" get index of an ngram into a list of size equal total number of possible ngrams\n",
|
| 444 |
+
" \"\"\"\n",
|
| 445 |
+
" assert len(ngram) == len(self.radixPow), \"ngram size mismatch\"\t\t\n",
|
| 446 |
+
" ngi = 0\n",
|
| 447 |
+
" for ch, rp in zip(ngram, self.radixPow):\n",
|
| 448 |
+
" i = self.chDomain.index(ch)\n",
|
| 449 |
+
" ngi += i * rp\n",
|
| 450 |
+
"\n",
|
| 451 |
+
" return ngi\n",
|
| 452 |
+
"\n",
|
| 453 |
+
"\n",
|
| 454 |
+
"class TfIdf:\n",
|
| 455 |
+
" \"\"\"\n",
|
| 456 |
+
" TF IDF\t\n",
|
| 457 |
+
" \"\"\"\n",
|
| 458 |
+
" def __init__(self, vocFilt, doIdf, verbose=False):\n",
|
| 459 |
+
" \"\"\"\n",
|
| 460 |
+
" initialize\n",
|
| 461 |
+
" \"\"\"\n",
|
| 462 |
+
" self.vocFilt = vocFilt\n",
|
| 463 |
+
" self.doIdf = doIdf\n",
|
| 464 |
+
" self.wordCounter = {}\n",
|
| 465 |
+
" self.wordFreq = {}\n",
|
| 466 |
+
" self.wordInDocCount = {}\n",
|
| 467 |
+
" self.docCount = 0\n",
|
| 468 |
+
" self.corpSize = 0\n",
|
| 469 |
+
" self.freqDone = False\n",
|
| 470 |
+
" self.vocabulary = set()\n",
|
| 471 |
+
" self.wordIndex = None\n",
|
| 472 |
+
" self.verbose = verbose\n",
|
| 473 |
+
" self.vecWords = None\n",
|
| 474 |
+
"\n",
|
| 475 |
+
" def countDocWords(self, words):\n",
|
| 476 |
+
" \"\"\"\n",
|
| 477 |
+
" count words in a doc\n",
|
| 478 |
+
" \"\"\"\n",
|
| 479 |
+
" if self.verbose:\n",
|
| 480 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 481 |
+
" for word in words:\n",
|
| 482 |
+
" if self.vocFilt is None or word in self.vocFilt:\n",
|
| 483 |
+
" count = self.wordCounter.get(word, 0)\n",
|
| 484 |
+
" self.wordCounter[word] = count + 1\n",
|
| 485 |
+
" self.corpSize += len(words)\n",
|
| 486 |
+
" self.vocabulary.update(words)\n",
|
| 487 |
+
"\n",
|
| 488 |
+
" if (self.doIdf):\n",
|
| 489 |
+
" self.docCount += 1\n",
|
| 490 |
+
" for word in set(words):\n",
|
| 491 |
+
" self.wordInDocCount.get(word, 0)\n",
|
| 492 |
+
" self.wordInDocCount[word] = count + 1\n",
|
| 493 |
+
" self.freqDone = False\n",
|
| 494 |
+
"\n",
|
| 495 |
+
"\n",
|
| 496 |
+
" def getWordFreq(self):\n",
|
| 497 |
+
" \"\"\"\n",
|
| 498 |
+
" get tfidf for corpus\n",
|
| 499 |
+
" \"\"\"\n",
|
| 500 |
+
" if self.verbose:\n",
|
| 501 |
+
" print (\"counter size \" + str(len(self.wordCounter)))\n",
|
| 502 |
+
" if not self.freqDone:\n",
|
| 503 |
+
" for item in self.wordCounter.items():\n",
|
| 504 |
+
" self.wordFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
|
| 505 |
+
" if self.doIdf:\n",
|
| 506 |
+
" for k in self.wordFreq.keys():\n",
|
| 507 |
+
" self.wordFreq.items[k] *= math.log(self.docCount / self.wordInDocCount.items[k])\t\n",
|
| 508 |
+
" self.freqDone = True\n",
|
| 509 |
+
" return self.wordFreq\n",
|
| 510 |
+
"\n",
|
| 511 |
+
" def getCount(self, word):\n",
|
| 512 |
+
" \"\"\"\n",
|
| 513 |
+
" get counter\n",
|
| 514 |
+
" \"\"\"\n",
|
| 515 |
+
" if word in self.wordCounter:\n",
|
| 516 |
+
" count = self.wordCounter[word]\n",
|
| 517 |
+
" else:\n",
|
| 518 |
+
" raise ValueError(\"word not found in count table \" + word)\n",
|
| 519 |
+
" return count\n",
|
| 520 |
+
"\n",
|
| 521 |
+
" def getFreq(self, word):\n",
|
| 522 |
+
" \"\"\"\n",
|
| 523 |
+
" get normalized frequency\n",
|
| 524 |
+
" \"\"\"\n",
|
| 525 |
+
" if word in self.wordFreq:\n",
|
| 526 |
+
" freq = self.wordFreq[word]\n",
|
| 527 |
+
" else:\n",
|
| 528 |
+
" raise ValueError(\"word not found in count table \" + word)\n",
|
| 529 |
+
" return freq\n",
|
| 530 |
+
"\n",
|
| 531 |
+
" def resetCounter(self):\n",
|
| 532 |
+
" \"\"\"\n",
|
| 533 |
+
" reset counter\n",
|
| 534 |
+
" \"\"\"\n",
|
| 535 |
+
" self.wordCounter = {}\n",
|
| 536 |
+
"\n",
|
| 537 |
+
" def buildVocabulary(self, words):\n",
|
| 538 |
+
" \"\"\"\n",
|
| 539 |
+
" build vocbulary\n",
|
| 540 |
+
" \"\"\"\n",
|
| 541 |
+
" self.vocabulary.update(words)\n",
|
| 542 |
+
"\n",
|
| 543 |
+
" def getVocabulary(self):\n",
|
| 544 |
+
" \"\"\"\n",
|
| 545 |
+
" return vocabulary\n",
|
| 546 |
+
" \"\"\"\n",
|
| 547 |
+
" return self.vocabulary\n",
|
| 548 |
+
"\n",
|
| 549 |
+
" def creatWordIndex(self):\n",
|
| 550 |
+
" \"\"\"\n",
|
| 551 |
+
" index for all words in vcabulary\n",
|
| 552 |
+
" \"\"\"\n",
|
| 553 |
+
" self.wordIndex = {word : idx for idx, word in enumerate(list(self.vocabulary))}\n",
|
| 554 |
+
"\n",
|
| 555 |
+
" def getVector(self, words, byCount, normalized):\n",
|
| 556 |
+
" \"\"\"\n",
|
| 557 |
+
" get vector\n",
|
| 558 |
+
" \"\"\"\n",
|
| 559 |
+
" if self.vecWords is None:\n",
|
| 560 |
+
" self.vecWords = list(self.wordCounter)\n",
|
| 561 |
+
" vec = list(map(lambda vw: self.getVecElem(vw, words, byCount, normalized), self.vecWords))\n",
|
| 562 |
+
" return vec\n",
|
| 563 |
+
"\n",
|
| 564 |
+
" def getVecElem(self, vw, words, byCount, normalized):\n",
|
| 565 |
+
" \"\"\"\n",
|
| 566 |
+
" vector element\n",
|
| 567 |
+
" \"\"\"\n",
|
| 568 |
+
" el = 0\n",
|
| 569 |
+
" if vw in words:\n",
|
| 570 |
+
" if byCount:\n",
|
| 571 |
+
" if normalized:\n",
|
| 572 |
+
" el = self.wordFreq[vw]\n",
|
| 573 |
+
" else:\n",
|
| 574 |
+
" el = self.wordCounter[vw]\n",
|
| 575 |
+
" else:\n",
|
| 576 |
+
" el = 1\n",
|
| 577 |
+
" return el\n",
|
| 578 |
+
"\n",
|
| 579 |
+
" def save(self, saveFile):\n",
|
| 580 |
+
" \"\"\"\n",
|
| 581 |
+
" save\n",
|
| 582 |
+
" \"\"\"\n",
|
| 583 |
+
" sf = open(saveFile, \"wb\")\n",
|
| 584 |
+
" pickle.dump(self, sf)\n",
|
| 585 |
+
" sf.close()\n",
|
| 586 |
+
"\n",
|
| 587 |
+
" # load \n",
|
| 588 |
+
" @staticmethod\n",
|
| 589 |
+
" def load(saveFile):\n",
|
| 590 |
+
" \"\"\"\n",
|
| 591 |
+
" load\n",
|
| 592 |
+
" \"\"\"\n",
|
| 593 |
+
" sf = open(saveFile, \"rb\")\n",
|
| 594 |
+
" tfidf = pickle.load(sf)\n",
|
| 595 |
+
" sf.close()\n",
|
| 596 |
+
" return tfidf\n",
|
| 597 |
+
"\n",
|
| 598 |
+
"# bigram\n",
|
| 599 |
+
"class BiGram(NGram):\n",
|
| 600 |
+
" def __init__(self, vocFilt, verbose=False):\n",
|
| 601 |
+
" \"\"\"\n",
|
| 602 |
+
" initialize\n",
|
| 603 |
+
" \"\"\"\n",
|
| 604 |
+
" super(BiGram, self).__init__(vocFilt, verbose)\n",
|
| 605 |
+
"\n",
|
| 606 |
+
" def toNGram(self, words):\n",
|
| 607 |
+
" \"\"\"\n",
|
| 608 |
+
" convert to Ngrams\n",
|
| 609 |
+
" \"\"\"\n",
|
| 610 |
+
" return self.toBiGram(words)\n",
|
| 611 |
+
"\n",
|
| 612 |
+
"# trigram\n",
|
| 613 |
+
"class TriGram(NGram):\n",
|
| 614 |
+
" def __init__(self, vocFilt, verbose=False):\n",
|
| 615 |
+
" \"\"\"\n",
|
| 616 |
+
" initialize\n",
|
| 617 |
+
" \"\"\"\n",
|
| 618 |
+
" super(TriGram, self).__init__(vocFilt, verbose)\n",
|
| 619 |
+
"\n",
|
| 620 |
+
" def toNGram(self, words):\n",
|
| 621 |
+
" \"\"\"\n",
|
| 622 |
+
" convert to Ngrams\n",
|
| 623 |
+
" \"\"\"\n",
|
| 624 |
+
" return self.toTriGram(words)\n",
|
| 625 |
+
"\n",
|
| 626 |
+
"\n",
|
| 627 |
+
"\n",
|
| 628 |
+
"class DocSentences:\n",
|
| 629 |
+
" \"\"\"\n",
|
| 630 |
+
" sentence processor\n",
|
| 631 |
+
" \"\"\"\n",
|
| 632 |
+
" def __init__(self, filePath, minLength, verbose, text=None):\n",
|
| 633 |
+
" \"\"\"\n",
|
| 634 |
+
" initialize\n",
|
| 635 |
+
" \"\"\"\n",
|
| 636 |
+
" if filePath:\n",
|
| 637 |
+
" self.filePath = filePath\n",
|
| 638 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 639 |
+
" content = contentFile.read()\n",
|
| 640 |
+
" elif text:\n",
|
| 641 |
+
" content = text\n",
|
| 642 |
+
" else:\n",
|
| 643 |
+
" raise valueError(\"either file path or text must be provided\")\n",
|
| 644 |
+
"\n",
|
| 645 |
+
" #self.sentences = content.split('.')\n",
|
| 646 |
+
" self.verbose = verbose\n",
|
| 647 |
+
" tp = TextPreProcessor()\n",
|
| 648 |
+
" content = tp.removeNonAsciiFromText(content)\n",
|
| 649 |
+
" sentences = sent_tokenize(content)\n",
|
| 650 |
+
" self.sentences = list(filter(lambda s: len(nltk.word_tokenize(s)) >= minLength, sentences))\n",
|
| 651 |
+
" if self.verbose:\n",
|
| 652 |
+
" print (\"num of senteces after length filter \" + str(len(self.sentences)))\n",
|
| 653 |
+
" self.sentencesAsTokens = [clean(s, tp, verbose) for s in self.sentences]\t\n",
|
| 654 |
+
"\n",
|
| 655 |
+
" # get sentence tokens\n",
|
| 656 |
+
" def getSentencesAsTokens(self):\n",
|
| 657 |
+
" return self.sentencesAsTokens\n",
|
| 658 |
+
"\n",
|
| 659 |
+
" # get sentences\n",
|
| 660 |
+
" def getSentences(self):\n",
|
| 661 |
+
" return self.sentences\n",
|
| 662 |
+
"\n",
|
| 663 |
+
" # build term freq table\n",
|
| 664 |
+
" def getTermFreqTable(self):\n",
|
| 665 |
+
" # term count table for all words\n",
|
| 666 |
+
" termTable = TfIdf(None, False)\n",
|
| 667 |
+
" sentWords = self.getSentencesAsTokens()\n",
|
| 668 |
+
" for seWords in sentWords:\n",
|
| 669 |
+
" termTable.countDocWords(seWords)\n",
|
| 670 |
+
" return termTable\n",
|
| 671 |
+
"\n",
|
| 672 |
+
"# sentence processor\n",
|
| 673 |
+
"class WordVectorContainer:\n",
|
| 674 |
+
" def __init__(self, dirPath, verbose):\n",
|
| 675 |
+
" \"\"\"\n",
|
| 676 |
+
" initialize\n",
|
| 677 |
+
" \"\"\"\n",
|
| 678 |
+
" self.docs = list()\n",
|
| 679 |
+
" self.wordVectors = list()\n",
|
| 680 |
+
" self.tp = TextPreProcessor()\n",
|
| 681 |
+
" self.similarityAlgo = \"cosine\"\n",
|
| 682 |
+
" self.simAlgoNormalizer = None\n",
|
| 683 |
+
" self.termTable = None\n",
|
| 684 |
+
"\n",
|
| 685 |
+
"\n",
|
| 686 |
+
" def addDir(self, dirPath):\n",
|
| 687 |
+
" \"\"\"\n",
|
| 688 |
+
" add content of all files ina directory\n",
|
| 689 |
+
" \"\"\"\n",
|
| 690 |
+
" docs, filePaths = getFileContent(dirPath, verbose)\n",
|
| 691 |
+
" self.docs.extend(docs)\n",
|
| 692 |
+
" self.wordVectors.extend([clean(doc, self.tp, verbose) for doc in docs])\n",
|
| 693 |
+
"\n",
|
| 694 |
+
" def addFile(self, filePath):\n",
|
| 695 |
+
" \"\"\"\n",
|
| 696 |
+
" add file content\n",
|
| 697 |
+
" \"\"\"\n",
|
| 698 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 699 |
+
" content = contentFile.read()\n",
|
| 700 |
+
" self.wordVectors.append(clean(content, self.tp, verbose))\n",
|
| 701 |
+
"\n",
|
| 702 |
+
" def addText(self, text):\n",
|
| 703 |
+
" \"\"\"\n",
|
| 704 |
+
" add text\n",
|
| 705 |
+
" \"\"\"\n",
|
| 706 |
+
" self.wordVectors.append(clean(text, self.tp, verbose))\n",
|
| 707 |
+
"\n",
|
| 708 |
+
" def addWords(self, words):\n",
|
| 709 |
+
" \"\"\"\n",
|
| 710 |
+
" add words\n",
|
| 711 |
+
" \"\"\"\n",
|
| 712 |
+
" self.wordVectors.append(words)\n",
|
| 713 |
+
"\n",
|
| 714 |
+
" def withSimilarityAlgo(self, algo, normalizer=None):\n",
|
| 715 |
+
" \"\"\"\n",
|
| 716 |
+
" set similarity algo\n",
|
| 717 |
+
" \"\"\"\n",
|
| 718 |
+
" self.similarityAlgo = algo\n",
|
| 719 |
+
" self.simAlgoNormalizer = normalizer\n",
|
| 720 |
+
"\n",
|
| 721 |
+
" def getDocsWords(self):\n",
|
| 722 |
+
" \"\"\"\n",
|
| 723 |
+
" get word vectors\n",
|
| 724 |
+
" \"\"\"\n",
|
| 725 |
+
" return self.wordVectors\n",
|
| 726 |
+
"\n",
|
| 727 |
+
" def getDocs(self):\n",
|
| 728 |
+
" \"\"\"\n",
|
| 729 |
+
" get docs\n",
|
| 730 |
+
" \"\"\"\n",
|
| 731 |
+
" return self.docs\n",
|
| 732 |
+
"\n",
|
| 733 |
+
" def getTermFreqTable(self):\n",
|
| 734 |
+
" \"\"\"\n",
|
| 735 |
+
" term count table for all words\n",
|
| 736 |
+
" \"\"\"\n",
|
| 737 |
+
" self.termTable = TfIdf(None, False)\n",
|
| 738 |
+
" for words in self.wordVectors:\n",
|
| 739 |
+
" self.termTable.countDocWords(words)\n",
|
| 740 |
+
" self.termTable.getWordFreq()\n",
|
| 741 |
+
" return self.termTable\n",
|
| 742 |
+
"\n",
|
| 743 |
+
" def getPairWiseSimilarity(self, byCount, normalized):\n",
|
| 744 |
+
" \"\"\"\n",
|
| 745 |
+
" pair wise similarity\n",
|
| 746 |
+
" \"\"\"\n",
|
| 747 |
+
" self.getNumWordVectors()\n",
|
| 748 |
+
"\n",
|
| 749 |
+
" size = len(self.wordVectors)\n",
|
| 750 |
+
" simArray = np.empty(shape=(size,size))\n",
|
| 751 |
+
" for i in range(size):\n",
|
| 752 |
+
" simArray[i][i] = 1.0\n",
|
| 753 |
+
"\n",
|
| 754 |
+
" for i in range(size):\n",
|
| 755 |
+
" for j in range(i+1, size):\n",
|
| 756 |
+
" if self.similarityAlgo == \"cosine\":\n",
|
| 757 |
+
" sim = cosineSimilarity(self.numWordVectors[i], self.numWordVectors[j])\n",
|
| 758 |
+
" elif self.similarityAlgo == \"jaccard\":\n",
|
| 759 |
+
" sim = jaccardSimilarity(self.wordVectors[i], self.wordVectors[j],\\\n",
|
| 760 |
+
" self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
|
| 761 |
+
" else:\n",
|
| 762 |
+
" raise ValueError(\"invalid similarity algorithms\")\n",
|
| 763 |
+
" simArray[i][j] = sim\n",
|
| 764 |
+
" simArray[j][i] = sim\n",
|
| 765 |
+
" return simArray\n",
|
| 766 |
+
"\n",
|
| 767 |
+
" def getInterSetSimilarity(self, byCount, normalized, split):\n",
|
| 768 |
+
" \"\"\"\n",
|
| 769 |
+
" inter set pair wise similarity\n",
|
| 770 |
+
" \"\"\"\n",
|
| 771 |
+
" self.getNumWordVectors()\n",
|
| 772 |
+
" size = len(self.wordVectors)\n",
|
| 773 |
+
" if not self.similarityAlgo == \"jaccard\":\n",
|
| 774 |
+
" firstNumVec = self.numWordVectors[:split]\n",
|
| 775 |
+
" secNumVec = self.numWordVectors[split:]\n",
|
| 776 |
+
" fiSize = len(firstNumVec)\n",
|
| 777 |
+
" seSize = len(secNumVec)\n",
|
| 778 |
+
" else:\n",
|
| 779 |
+
" firstVec = self.wordVectors[:split]\n",
|
| 780 |
+
" secVec = self.wordVectors[split:]\n",
|
| 781 |
+
" fiSize = len(firstVec)\n",
|
| 782 |
+
" seSize = len(secVec)\n",
|
| 783 |
+
"\n",
|
| 784 |
+
" simArray = np.empty(shape=(fiSize,seSize))\n",
|
| 785 |
+
" for i in range(fiSize):\n",
|
| 786 |
+
" for j in range(seSize):\n",
|
| 787 |
+
" if self.similarityAlgo == \"cosine\":\n",
|
| 788 |
+
" sim = cosineSimilarity(firstNumVec[i], secNumVec[j])\n",
|
| 789 |
+
" elif self.similarityAlgo == \"jaccard\":\n",
|
| 790 |
+
" sim = jaccardSimilarity(firstVec[i], secVec[j],\\\n",
|
| 791 |
+
" self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
|
| 792 |
+
" else:\n",
|
| 793 |
+
" raise ValueError(\"invalid similarity algorithms\")\n",
|
| 794 |
+
" simArray[i][j] = sim\n",
|
| 795 |
+
" return simArray\n",
|
| 796 |
+
"\n",
|
| 797 |
+
" def getNumWordVectors(self):\n",
|
| 798 |
+
" \"\"\"\n",
|
| 799 |
+
" get vectors\n",
|
| 800 |
+
" \"\"\"\n",
|
| 801 |
+
" if not self.similarityAlgo == \"jaccard\":\n",
|
| 802 |
+
" if self.numWordVectors is None:\n",
|
| 803 |
+
" self.numWordVectors = list(map(lambda wv: self.termTable.getVector(wv, byCount, normalized), self.wordVectors))\n",
|
| 804 |
+
"\n",
|
| 805 |
+
"# fragments documents into whole doc, paragraph or passages\n",
|
| 806 |
+
"class TextFragmentGenerator:\n",
|
| 807 |
+
" def __init__(self, level, minParNl, passSize, verbose=False):\n",
|
| 808 |
+
" \"\"\"\n",
|
| 809 |
+
" initialize\n",
|
| 810 |
+
" \"\"\"\n",
|
| 811 |
+
" self.level = level\n",
|
| 812 |
+
" self.minParNl = minParNl\n",
|
| 813 |
+
" self.passSize = passSize\n",
|
| 814 |
+
" self.fragments = None\n",
|
| 815 |
+
" self.verbose = verbose\n",
|
| 816 |
+
"\n",
|
| 817 |
+
" def loadDocs(self, fpaths):\n",
|
| 818 |
+
" \"\"\"\n",
|
| 819 |
+
" loads documents from one file, multiple files or all files under directory\n",
|
| 820 |
+
" \"\"\"\n",
|
| 821 |
+
" fPaths = fpaths.split(\",\")\n",
|
| 822 |
+
" if len(fPaths) == 1:\n",
|
| 823 |
+
" if os.path.isfile(fPaths[0]):\n",
|
| 824 |
+
" #one file\n",
|
| 825 |
+
" if self.verbose:\n",
|
| 826 |
+
" print(\"got one file from path\")\n",
|
| 827 |
+
" dnames = fPaths\n",
|
| 828 |
+
" docStr = getOneFileContent(fPaths[0])\n",
|
| 829 |
+
" dtexts = [docStr]\n",
|
| 830 |
+
" else:\n",
|
| 831 |
+
" #all files under directory\n",
|
| 832 |
+
" if self.verbose:\n",
|
| 833 |
+
" print(\"got all files under directory from path\")\n",
|
| 834 |
+
" dtexts, dnames = getFileContent(fPaths[0])\n",
|
| 835 |
+
" if self.verbose:\n",
|
| 836 |
+
" print(\"found {} files\".format(len(dtexts)))\n",
|
| 837 |
+
" else:\n",
|
| 838 |
+
" #list of files\n",
|
| 839 |
+
" if self.verbose: \n",
|
| 840 |
+
" print(\"got list of files from path\")\n",
|
| 841 |
+
" dnames = fPaths\n",
|
| 842 |
+
" dtexts = list(map(getOneFileContent, fpaths))\n",
|
| 843 |
+
" if self.verbose:\n",
|
| 844 |
+
" print(\"found {} files\".format(len(dtexts)))\n",
|
| 845 |
+
"\n",
|
| 846 |
+
" ndocs = (dtexts, dnames)\t\n",
|
| 847 |
+
" if self.verbose:\n",
|
| 848 |
+
" print(\"docs\")\n",
|
| 849 |
+
" for dn, dt in zip(dnames, dtexts):\n",
|
| 850 |
+
" print(dn + \"\\t\" + dt[:40])\n",
|
| 851 |
+
"\n",
|
| 852 |
+
" return ndocs\n",
|
| 853 |
+
"\n",
|
| 854 |
+
" def generateFragmentsFromFiles(self, fpaths):\n",
|
| 855 |
+
" \"\"\"\n",
|
| 856 |
+
" fragments documents into whole doc, paragraph or passages\n",
|
| 857 |
+
" \"\"\"\n",
|
| 858 |
+
" dtexts, dnames = self.loadDocs(fpaths)\n",
|
| 859 |
+
" return self.generateFragments(dtexts, dnames)\n",
|
| 860 |
+
"\n",
|
| 861 |
+
"\n",
|
| 862 |
+
" def generateFragmentsFromNamedDocs(self, ndocs):\n",
|
| 863 |
+
" \"\"\"\n",
|
| 864 |
+
" fragments documents into whole doc, paragraph or passages\n",
|
| 865 |
+
" \"\"\"\n",
|
| 866 |
+
" dtexts = list(map(lambda nd : nd[1], ndocs))\n",
|
| 867 |
+
" dnames = list(map(lambda nd : nd[0], ndocs))\n",
|
| 868 |
+
" #for i in range(len(dtexts)):\n",
|
| 869 |
+
" #\tprint(dnames[i])\n",
|
| 870 |
+
" #\tprint(dtexts[i][:40])\n",
|
| 871 |
+
" return self.generateFragments(dtexts, dnames)\n",
|
| 872 |
+
"\n",
|
| 873 |
+
" def generateFragments(self, dtexts, dnames):\n",
|
| 874 |
+
" \"\"\"\n",
|
| 875 |
+
" fragments documents into whole doc, paragraph or passages\n",
|
| 876 |
+
" \"\"\"\n",
|
| 877 |
+
" if self.level == \"para\" or self.level == \"passage\":\n",
|
| 878 |
+
" #split paras\n",
|
| 879 |
+
" dptexts = list()\n",
|
| 880 |
+
" dpnames = list()\n",
|
| 881 |
+
" for dt, dn in zip(dtexts, dnames):\n",
|
| 882 |
+
" paras = getParas(dt, self.minParNl)\n",
|
| 883 |
+
" if self.verbose:\n",
|
| 884 |
+
" print(dn)\n",
|
| 885 |
+
" print(\"no of paras {}\".format(len(paras)))\n",
|
| 886 |
+
" dptexts.extend(paras)\n",
|
| 887 |
+
" pnames = list(map(lambda i : dn + \":\" + str(i), range(len(paras))))\n",
|
| 888 |
+
" dpnames.extend(pnames)\n",
|
| 889 |
+
" dtexts = dptexts\n",
|
| 890 |
+
" dnames = dpnames\n",
|
| 891 |
+
"\n",
|
| 892 |
+
" if self.level == \"passage\":\n",
|
| 893 |
+
" #split each para into passages\n",
|
| 894 |
+
" dptexts = list()\n",
|
| 895 |
+
" dpnames = list()\n",
|
| 896 |
+
" for dt, dn in zip(dtexts, dnames):\n",
|
| 897 |
+
" sents = sent_tokenize(dt.strip())\t\t\t\n",
|
| 898 |
+
" if self.verbose:\n",
|
| 899 |
+
" print(dn)\n",
|
| 900 |
+
" print(\"no of sentences {}\".format(len(sents)))\n",
|
| 901 |
+
" span = self.passSize\n",
|
| 902 |
+
" if len(sents) <= span:\n",
|
| 903 |
+
" pass\n",
|
| 904 |
+
" else:\n",
|
| 905 |
+
" for i in range(0, len(sents) - span, 1):\n",
|
| 906 |
+
" dptext = None\n",
|
| 907 |
+
" for j in range(span):\n",
|
| 908 |
+
" if dptext is None:\n",
|
| 909 |
+
" dptext = sents[i + j] + \". \"\n",
|
| 910 |
+
" else:\n",
|
| 911 |
+
" dptext = dptext + sents[i + j] + \". \" \n",
|
| 912 |
+
" dpname = dn + \":\" + str(i)\n",
|
| 913 |
+
" dptexts.append(dptext)\n",
|
| 914 |
+
" dpnames.append(dpname)\n",
|
| 915 |
+
"\n",
|
| 916 |
+
" dtexts = dptexts\n",
|
| 917 |
+
" dnames = dpnames\n",
|
| 918 |
+
"\n",
|
| 919 |
+
" self.fragments = list(zip(dnames, dtexts))\n",
|
| 920 |
+
" #if self.verbose:\n",
|
| 921 |
+
" #\tprint(\"num fragments {}\".format(len(self.fragments)))\n",
|
| 922 |
+
" return self.fragments\n",
|
| 923 |
+
"\n",
|
| 924 |
+
" def showFragments(self):\n",
|
| 925 |
+
" \"\"\"\n",
|
| 926 |
+
" show fragments\n",
|
| 927 |
+
" \"\"\"\n",
|
| 928 |
+
" print(\"showing all \" + self.level + \" for the first 40 characters\")\n",
|
| 929 |
+
" for dn, dt in self.fragments:\n",
|
| 930 |
+
" print(dn + \"\\t\" + dt[:40])\n",
|
| 931 |
+
"\n",
|
| 932 |
+
" def isDocLevel(self):\n",
|
| 933 |
+
" \"\"\"\n",
|
| 934 |
+
" true if fragment is at doc level\n",
|
| 935 |
+
" \"\"\"\n",
|
| 936 |
+
" return self.level != \"para\" and self.level != \"passage\"\n",
|
| 937 |
+
"\n",
|
| 938 |
+
"# clean doc to create term array\n",
|
| 939 |
+
"def clean(doc, preprocessor, verbose):\n",
|
| 940 |
+
" \"\"\"\n",
|
| 941 |
+
" text pre process\n",
|
| 942 |
+
" \"\"\"\n",
|
| 943 |
+
" if verbose:\n",
|
| 944 |
+
" print (\"--raw doc\")\n",
|
| 945 |
+
" print (doc)\n",
|
| 946 |
+
" #print \"next clean\"\n",
|
| 947 |
+
" doc = preprocessor.removeNonAsciiFromText(doc)\n",
|
| 948 |
+
" words = preprocessor.tokenize(doc)\n",
|
| 949 |
+
" words = preprocessor.allow(words)\n",
|
| 950 |
+
" words = preprocessor.toLowercase(words)\n",
|
| 951 |
+
" words = preprocessor.removeStopwords(words)\n",
|
| 952 |
+
" words = preprocessor.removeShortWords(words, 3)\n",
|
| 953 |
+
" words = preprocessor.removePunctuation(words)\n",
|
| 954 |
+
" words = preprocessor.lemmatizeWords(words)\n",
|
| 955 |
+
" #words = preprocessor.removeNonAscii(words)\n",
|
| 956 |
+
" if verbose:\n",
|
| 957 |
+
" print (\"--after pre processing\")\n",
|
| 958 |
+
" print (words)\n",
|
| 959 |
+
" return words\n",
|
| 960 |
+
"\n",
|
| 961 |
+
"# get sentences\n",
|
| 962 |
+
"def getSentences(filePath):\n",
|
| 963 |
+
" \"\"\"\n",
|
| 964 |
+
" text pre process\n",
|
| 965 |
+
" \"\"\"\n",
|
| 966 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 967 |
+
" content = contentFile.read()\n",
|
| 968 |
+
" sentences = content.split('.')\n",
|
| 969 |
+
" return sentences\n",
|
| 970 |
+
"\n",
|
| 971 |
+
"def getParas(text, minParNl=2):\n",
|
| 972 |
+
" \"\"\"\n",
|
| 973 |
+
" split into paras\n",
|
| 974 |
+
" \"\"\"\n",
|
| 975 |
+
" regx = \"\\n+\" if minParNl == 1 else \"\\n{2,}\"\n",
|
| 976 |
+
" paras = re.split(regx, text.replace(\"\\r\\n\", \"\\n\"))\n",
|
| 977 |
+
" return paras\n"
|
| 978 |
+
]
|
| 979 |
+
}
|
| 980 |
+
],
|
| 981 |
+
"metadata": {
|
| 982 |
+
"kernelspec": {
|
| 983 |
+
"display_name": "Python 3 (ipykernel)",
|
| 984 |
+
"language": "python",
|
| 985 |
+
"name": "python3"
|
| 986 |
+
},
|
| 987 |
+
"language_info": {
|
| 988 |
+
"codemirror_mode": {
|
| 989 |
+
"name": "ipython",
|
| 990 |
+
"version": 3
|
| 991 |
+
},
|
| 992 |
+
"file_extension": ".py",
|
| 993 |
+
"mimetype": "text/x-python",
|
| 994 |
+
"name": "python",
|
| 995 |
+
"nbconvert_exporter": "python",
|
| 996 |
+
"pygments_lexer": "ipython3",
|
| 997 |
+
"version": "3.9.12"
|
| 998 |
+
}
|
| 999 |
+
},
|
| 1000 |
+
"nbformat": 4,
|
| 1001 |
+
"nbformat_minor": 5
|
| 1002 |
+
}
|
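The text processing classes above are meant to be composed: clean a raw document with TextPreProcessor, then feed the resulting token lists into TfIdf or one of the NGram counters. The sketch below is a minimal end to end example using only functions and classes defined in this notebook; the sample sentences are made up, and it assumes the nltk resources the preprocessor relies on (punkt, stopwords, wordnet) are already downloaded.

# minimal sketch composing TextPreProcessor, TfIdf and BiGram as defined above;
# sample text is invented and the required nltk corpora are assumed to be installed
docs = [
    "Gradient descent updates the weights to reduce the training loss.",
    "The training loss decreases as the optimizer updates the weights."
]

tp = TextPreProcessor()
tokenized = [clean(d, tp, False) for d in docs]   # tokenize, lowercase, drop stopwords, lemmatize

# corpus level term frequency table
tfidf = TfIdf(None, False)
for words in tokenized:
    tfidf.countDocWords(words)
freq = tfidf.getWordFreq()                        # normalized term frequencies

# normalized frequency vector for the first document
vec = tfidf.getVector(tokenized[0], byCount=True, normalized=True)

# bigram counts and frequencies over the same corpus
bg = BiGram(None)
for words in tokenized:
    bg.countDocNGrams(words)
print(bg.getNGramFreq())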
lib/.ipynb_checkpoints/util-checkpoint.ipynb
ADDED
|
@@ -0,0 +1,2141 @@
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "031d69ef",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"from random import randint\n",
|
| 13 |
+
"import random\n",
|
| 14 |
+
"import time\n",
|
| 15 |
+
"import uuid\n",
|
| 16 |
+
"from datetime import datetime\n",
|
| 17 |
+
"import math\n",
|
| 18 |
+
"import numpy as np\n",
|
| 19 |
+
"import pandas as pd\n",
|
| 20 |
+
"import matplotlib.pyplot as plt\n",
|
| 21 |
+
"import numpy as np\n",
|
| 22 |
+
"import logging\n",
|
| 23 |
+
"import logging.handlers\n",
|
| 24 |
+
"import pickle\n",
|
| 25 |
+
"from contextlib import contextmanager\n",
|
| 26 |
+
"\n",
|
| 27 |
+
"tokens = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\",\n",
|
| 28 |
+
" \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
|
| 29 |
+
"numTokens = tokens[:10]\n",
|
| 30 |
+
"alphaTokens = tokens[10:36]\n",
|
| 31 |
+
"loCaseChars = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
|
| 32 |
+
"\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"typeInt = \"int\"\n",
|
| 35 |
+
"typeFloat = \"float\"\n",
|
| 36 |
+
"typeString = \"string\"\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"secInMinute = 60\n",
|
| 39 |
+
"secInHour = 60 * 60\n",
|
| 40 |
+
"secInDay = 24 * secInHour\n",
|
| 41 |
+
"secInWeek = 7 * secInDay\n",
|
| 42 |
+
"secInYear = 365 * secInDay\n",
|
| 43 |
+
"secInMonth = secInYear / 12\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"minInHour = 60\n",
|
| 46 |
+
"minInDay = 24 * minInHour\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"ftPerYard = 3\n",
|
| 49 |
+
"ftPerMile = ftPerYard * 1760\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"def genID(size):\n",
|
| 53 |
+
" \"\"\"\n",
|
| 54 |
+
" generates ID\n",
|
| 55 |
+
"\n",
|
| 56 |
+
" Parameters\n",
|
| 57 |
+
" size : size of ID\n",
|
| 58 |
+
" \"\"\"\n",
|
| 59 |
+
" id = \"\"\n",
|
| 60 |
+
" for i in range(size):\n",
|
| 61 |
+
" id = id + selectRandomFromList(tokens)\n",
|
| 62 |
+
" return id\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"def genIdList(numId, idSize):\n",
|
| 65 |
+
" \"\"\"\n",
|
| 66 |
+
" generate list of IDs\n",
|
| 67 |
+
"\n",
|
| 68 |
+
" Parameters:\n",
|
| 69 |
+
" numId: number of Ids\n",
|
| 70 |
+
" idSize: ID size\n",
|
| 71 |
+
" \"\"\"\n",
|
| 72 |
+
" iDs = []\n",
|
| 73 |
+
" for i in range(numId):\n",
|
| 74 |
+
" iDs.append(genID(idSize))\n",
|
| 75 |
+
" return iDs\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"def genNumID(size):\n",
|
| 78 |
+
" \"\"\"\n",
|
| 79 |
+
" generates ID consisting of digits onl\n",
|
| 80 |
+
"\n",
|
| 81 |
+
" Parameters\n",
|
| 82 |
+
" size : size of ID\n",
|
| 83 |
+
" \"\"\"\n",
|
| 84 |
+
" id = \"\"\n",
|
| 85 |
+
" for i in range(size):\n",
|
| 86 |
+
" id = id + selectRandomFromList(numTokens)\n",
|
| 87 |
+
" return id\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"def genLowCaseID(size):\n",
|
| 90 |
+
" \"\"\"\n",
|
| 91 |
+
" generates ID consisting of lower case chars\n",
|
| 92 |
+
"\n",
|
| 93 |
+
" Parameters\n",
|
| 94 |
+
" size : size of ID\n",
|
| 95 |
+
" \"\"\"\n",
|
| 96 |
+
" id = \"\"\n",
|
| 97 |
+
" for i in range(size):\n",
|
| 98 |
+
" id = id + selectRandomFromList(loCaseChars)\n",
|
| 99 |
+
" return id\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"def genNumIdList(numId, idSize):\n",
|
| 102 |
+
" \"\"\"\n",
|
| 103 |
+
" generate list of numeric IDs\n",
|
| 104 |
+
"\n",
|
| 105 |
+
" Parameters:\n",
|
| 106 |
+
" numId: number of Ids\n",
|
| 107 |
+
" idSize: ID size\n",
|
| 108 |
+
" \"\"\"\n",
|
| 109 |
+
" iDs = []\n",
|
| 110 |
+
" for i in range(numId):\n",
|
| 111 |
+
" iDs.append(genNumID(idSize))\n",
|
| 112 |
+
" return iDs\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"def genNameInitial():\n",
|
| 115 |
+
" \"\"\"\n",
|
| 116 |
+
" generate name initial\n",
|
| 117 |
+
" \"\"\"\n",
|
| 118 |
+
" return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"def genPhoneNum(arCode):\n",
|
| 121 |
+
" \"\"\"\n",
|
| 122 |
+
" generates phone number\n",
|
| 123 |
+
"\n",
|
| 124 |
+
" Parameters\n",
|
| 125 |
+
" arCode: area code\n",
|
| 126 |
+
" \"\"\"\n",
|
| 127 |
+
" phNum = genNumID(7)\n",
|
| 128 |
+
" return arCode + str(phNum)\n",
|
| 129 |
+
"\n",
|
| 130 |
+
"def selectRandomFromList(ldata):\n",
|
| 131 |
+
" \"\"\"\n",
|
| 132 |
+
" select an element randomly from a lis\n",
|
| 133 |
+
"\n",
|
| 134 |
+
" Parameters\n",
|
| 135 |
+
" ldata : list data\n",
|
| 136 |
+
" \"\"\"\n",
|
| 137 |
+
" return ldata[randint(0, len(ldata)-1)]\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"def selectOtherRandomFromList(ldata, cval):\n",
|
| 140 |
+
" \"\"\"\n",
|
| 141 |
+
" select an element randomly from a list excluding the given one\n",
|
| 142 |
+
"\n",
|
| 143 |
+
" Parameters\n",
|
| 144 |
+
" ldata : list data\n",
|
| 145 |
+
" cval : value to be excluded\n",
|
| 146 |
+
" \"\"\"\n",
|
| 147 |
+
" nval = selectRandomFromList(ldata)\n",
|
| 148 |
+
" while nval == cval:\n",
|
| 149 |
+
" nval = selectRandomFromList(ldata)\n",
|
| 150 |
+
" return nval\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"def selectRandomSubListFromList(ldata, num):\n",
|
| 153 |
+
" \"\"\"\n",
|
| 154 |
+
" generates random sublist from a list without replacemment\n",
|
| 155 |
+
"\n",
|
| 156 |
+
" Parameters\n",
|
| 157 |
+
" ldata : list data\n",
|
| 158 |
+
" num : output list size\n",
|
| 159 |
+
" \"\"\"\n",
|
| 160 |
+
" assertLesser(num, len(ldata), \"size of sublist to be sampled greater than or equal to main list\")\n",
|
| 161 |
+
" i = randint(0, len(ldata)-1)\n",
|
| 162 |
+
" sel = ldata[i]\n",
|
| 163 |
+
" selSet = {i}\n",
|
| 164 |
+
" selList = [sel]\n",
|
| 165 |
+
" while (len(selSet) < num):\n",
|
| 166 |
+
" i = randint(0, len(ldata)-1)\n",
|
| 167 |
+
" if (i not in selSet):\n",
|
| 168 |
+
" sel = ldata[i]\n",
|
| 169 |
+
" selSet.add(i)\n",
|
| 170 |
+
" selList.append(sel)\t\t\n",
|
| 171 |
+
" return selList\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"def selectRandomSubListFromListWithRepl(ldata, num):\n",
|
| 174 |
+
" \"\"\"\n",
|
| 175 |
+
" generates random sublist from a list with replacemment\n",
|
| 176 |
+
"\n",
|
| 177 |
+
" Parameters\n",
|
| 178 |
+
" ldata : list data\n",
|
| 179 |
+
" num : output list size\n",
|
| 180 |
+
" \"\"\"\n",
|
| 181 |
+
" return list(map(lambda i : selectRandomFromList(ldata), range(num)))\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"def selectRandomFromDict(ddata):\n",
|
| 184 |
+
" \"\"\"\n",
|
| 185 |
+
" select an element randomly from a dictionary\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" Parameters\n",
|
| 188 |
+
" ddata : dictionary data\n",
|
| 189 |
+
" \"\"\"\n",
|
| 190 |
+
" dkeys = list(ddata.keys())\n",
|
| 191 |
+
" dk = selectRandomFromList(dkeys)\n",
|
| 192 |
+
" el = (dk, ddata[dk])\n",
|
| 193 |
+
" return el\n",
|
| 194 |
+
"\n",
|
| 195 |
+
"def setListRandomFromList(ldata, ldataRepl):\n",
|
| 196 |
+
" \"\"\"\n",
|
| 197 |
+
" sets some elents in the first list randomly with elements from the second list\n",
|
| 198 |
+
"\n",
|
| 199 |
+
" Parameters\n",
|
| 200 |
+
" ldata : list data\n",
|
| 201 |
+
" ldataRepl : list with replacement data\n",
|
| 202 |
+
" \"\"\"\n",
|
| 203 |
+
" l = len(ldata)\n",
|
| 204 |
+
" selSet = set()\n",
|
| 205 |
+
" for d in ldataRepl:\n",
|
| 206 |
+
" i = randint(0, l-1)\n",
|
| 207 |
+
" while i in selSet:\n",
|
| 208 |
+
" i = randint(0, l-1)\n",
|
| 209 |
+
" ldata[i] = d\n",
|
| 210 |
+
" selSet.add(i)\n",
|
| 211 |
+
"\n",
|
| 212 |
+
"def genIpAddress():\n",
|
| 213 |
+
" \"\"\"\n",
|
| 214 |
+
" generates IP address\n",
|
| 215 |
+
" \"\"\"\n",
|
| 216 |
+
" i1 = randint(0,256)\n",
|
| 217 |
+
" i2 = randint(0,256)\n",
|
| 218 |
+
" i3 = randint(0,256)\n",
|
| 219 |
+
" i4 = randint(0,256)\n",
|
| 220 |
+
" ip = \"%d.%d.%d.%d\" %(i1,i2,i3,i4)\n",
|
| 221 |
+
" return ip\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"def curTimeMs():\n",
|
| 224 |
+
" \"\"\"\n",
|
| 225 |
+
" current time in ms\n",
|
| 226 |
+
" \"\"\"\n",
|
| 227 |
+
" return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"def secDegPolyFit(x1, y1, x2, y2, x3, y3):\n",
|
| 230 |
+
" \"\"\"\n",
|
| 231 |
+
" second deg polynomial \t\n",
|
| 232 |
+
"\n",
|
| 233 |
+
" Parameters\n",
|
| 234 |
+
" x1 : 1st point x\n",
|
| 235 |
+
" y1 : 1st point y\n",
|
| 236 |
+
" x2 : 2nd point x\n",
|
| 237 |
+
" y2 : 2nd point y\n",
|
| 238 |
+
" x3 : 3rd point x\n",
|
| 239 |
+
" y3 : 3rd point y\n",
|
| 240 |
+
" \"\"\"\n",
|
| 241 |
+
" t = (y1 - y2) / (x1 - x2)\n",
|
| 242 |
+
" a = t - (y2 - y3) / (x2 - x3)\n",
|
| 243 |
+
" a = a / (x1 - x3)\n",
|
| 244 |
+
" b = t - a * (x1 + x2)\n",
|
| 245 |
+
" c = y1 - a * x1 * x1 - b * x1\n",
|
| 246 |
+
" return (a, b, c)\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"def range_limit(val, minv, maxv):\n",
|
| 249 |
+
" \"\"\"\n",
|
| 250 |
+
" range limit a value\n",
|
| 251 |
+
"\n",
|
| 252 |
+
" Parameters\n",
|
| 253 |
+
" val : data value\n",
|
| 254 |
+
" minv : minimum\n",
|
| 255 |
+
" maxv : maximum\n",
|
| 256 |
+
" \"\"\"\n",
|
| 257 |
+
" if (val < minv):\n",
|
| 258 |
+
" val = minv\n",
|
| 259 |
+
" elif (val > maxv):\n",
|
| 260 |
+
" val = maxv\n",
|
| 261 |
+
" return val\t\n",
|
| 262 |
+
"\n",
|
| 263 |
+
"def isInRange(val, minv, maxv):\n",
|
| 264 |
+
" \"\"\"\n",
|
| 265 |
+
" checks if within range\n",
|
| 266 |
+
"\n",
|
| 267 |
+
" Parameters\n",
|
| 268 |
+
" val : data value\n",
|
| 269 |
+
" minv : minimum\n",
|
| 270 |
+
" maxv : maximum\n",
|
| 271 |
+
" \"\"\"\n",
|
| 272 |
+
" return val >= minv and val <= maxv\n",
|
| 273 |
+
"\n",
|
| 274 |
+
"def stripFileLines(filePath, offset):\n",
|
| 275 |
+
" \"\"\"\n",
|
| 276 |
+
" strips number of chars from both ends\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" Parameters\n",
|
| 279 |
+
" filePath : file path\n",
|
| 280 |
+
" offset : offset from both ends of line \n",
|
| 281 |
+
" \"\"\"\n",
|
| 282 |
+
" fp = open(filePath, \"r\")\n",
|
| 283 |
+
" for line in fp:\n",
|
| 284 |
+
" stripped = line[offset:len(line) - 1 - offset]\n",
|
| 285 |
+
" print (stripped)\n",
|
| 286 |
+
" fp.close()\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"def genLatLong(lat1, long1, lat2, long2):\n",
|
| 289 |
+
" \"\"\"\n",
|
| 290 |
+
" generate lat log within limits\n",
|
| 291 |
+
"\n",
|
| 292 |
+
" Parameters\n",
|
| 293 |
+
" lat1 : lat of 1st point\n",
|
| 294 |
+
" long1 : long of 1st point\n",
|
| 295 |
+
" lat2 : lat of 2nd point\n",
|
| 296 |
+
" long2 : long of 2nd point\n",
|
| 297 |
+
" \"\"\"\n",
|
| 298 |
+
" lat = lat1 + (lat2 - lat1) * random.random()\n",
|
| 299 |
+
" longg = long1 + (long2 - long1) * random.random()\n",
|
| 300 |
+
" return (lat, longg)\n",
|
| 301 |
+
"\n",
|
| 302 |
+
"def geoDistance(lat1, long1, lat2, long2):\n",
|
| 303 |
+
" \"\"\"\n",
|
| 304 |
+
" find geo distance in ft\n",
|
| 305 |
+
"\n",
|
| 306 |
+
" Parameters\n",
|
| 307 |
+
" lat1 : lat of 1st point\n",
|
| 308 |
+
" long1 : long of 1st point\n",
|
| 309 |
+
" lat2 : lat of 2nd point\n",
|
| 310 |
+
" long2 : long of 2nd point\n",
|
| 311 |
+
" \"\"\"\n",
|
| 312 |
+
" latDiff = math.radians(lat1 - lat2)\n",
|
| 313 |
+
" longDiff = math.radians(long1 - long2)\n",
|
| 314 |
+
" l1 = math.sin(latDiff/2.0)\n",
|
| 315 |
+
" l2 = math.sin(longDiff/2.0)\n",
|
| 316 |
+
" l3 = math.cos(math.radians(lat1))\n",
|
| 317 |
+
" l4 = math.cos(math.radians(lat2))\n",
|
| 318 |
+
" a = l1 * l1 + l3 * l4 * l2 * l2\n",
|
| 319 |
+
" l5 = math.sqrt(a)\n",
|
| 320 |
+
" l6 = math.sqrt(1.0 - a)\n",
|
| 321 |
+
" c = 2.0 * math.atan2(l5, l6)\n",
|
| 322 |
+
" r = 6371008.8 * 3.280840\n",
|
| 323 |
+
" return c * r\n",
|
| 324 |
+
"\n",
|
| 325 |
+
"def minLimit(val, limit):\n",
|
| 326 |
+
" \"\"\"\n",
|
| 327 |
+
" min limit\n",
|
| 328 |
+
" Parameters\n",
|
| 329 |
+
" \"\"\"\n",
|
| 330 |
+
" if (val < limit):\n",
|
| 331 |
+
" val = limit\n",
|
| 332 |
+
" return val;\n",
|
| 333 |
+
"\n",
|
| 334 |
+
"def maxLimit(val, limit):\n",
|
| 335 |
+
" \"\"\"\n",
|
| 336 |
+
" max limit\n",
|
| 337 |
+
" Parameters\n",
|
| 338 |
+
" \"\"\"\n",
|
| 339 |
+
" if (val > limit):\n",
|
| 340 |
+
" val = limit\n",
|
| 341 |
+
" return val;\n",
|
| 342 |
+
"\n",
|
| 343 |
+
"def rangeSample(val, minLim, maxLim):\n",
|
| 344 |
+
" \"\"\"\n",
|
| 345 |
+
" if out side range sample within range\n",
|
| 346 |
+
"\n",
|
| 347 |
+
" Parameters\n",
|
| 348 |
+
" val : value\n",
|
| 349 |
+
" minLim : minimum\n",
|
| 350 |
+
" maxLim : maximum\n",
|
| 351 |
+
" \"\"\"\n",
|
| 352 |
+
" if val < minLim or val > maxLim:\n",
|
| 353 |
+
" val = randint(minLim, maxLim)\n",
|
| 354 |
+
" return val\n",
|
| 355 |
+
"\n",
|
| 356 |
+
"def genRandomIntListWithinRange(size, minLim, maxLim):\n",
|
| 357 |
+
" \"\"\"\n",
|
| 358 |
+
" random unique list of integers within range\n",
|
| 359 |
+
"\n",
|
| 360 |
+
" Parameters\n",
|
| 361 |
+
" size : size of returned list\n",
|
| 362 |
+
" minLim : minimum\n",
|
| 363 |
+
" maxLim : maximum\n",
|
| 364 |
+
" \"\"\"\n",
|
| 365 |
+
" values = set()\n",
|
| 366 |
+
" for i in range(size):\n",
|
| 367 |
+
" val = randint(minLim, maxLim)\n",
|
| 368 |
+
" while val not in values:\n",
|
| 369 |
+
" values.add(val)\n",
|
| 370 |
+
" return list(values)\n",
|
| 371 |
+
"\n",
|
| 372 |
+
"def preturbScalar(value, vrange):\n",
|
| 373 |
+
" \"\"\"\n",
|
| 374 |
+
" preturbs a mutiplicative value within range\n",
|
| 375 |
+
"\n",
|
| 376 |
+
" Parameters\n",
|
| 377 |
+
" value : data value\n",
|
| 378 |
+
" vrange : value delta fraction\n",
|
| 379 |
+
" \"\"\"\n",
|
| 380 |
+
" scale = 1.0 - vrange + 2 * vrange * random.random() \n",
|
| 381 |
+
" return value * scale\n",
|
| 382 |
+
"\n",
|
| 383 |
+
"def preturbScalarAbs(value, vrange):\n",
|
| 384 |
+
" \"\"\"\n",
|
| 385 |
+
" preturbs an absolute value within range\n",
|
| 386 |
+
"\n",
|
| 387 |
+
" Parameters\n",
|
| 388 |
+
" value : data value\n",
|
| 389 |
+
" vrange : value delta absolute\n",
|
| 390 |
+
" \"\"\"\n",
|
| 391 |
+
" delta = - vrange + 2.0 * vrange * random.random() \n",
|
| 392 |
+
" return value + delta\n",
|
| 393 |
+
"\n",
|
| 394 |
+
"def preturbVector(values, vrange):\n",
|
| 395 |
+
" \"\"\"\n",
|
| 396 |
+
" preturbs a list within range\n",
|
| 397 |
+
"\n",
|
| 398 |
+
" Parameters\n",
|
| 399 |
+
" values : list data\n",
|
| 400 |
+
" vrange : value delta fraction\n",
|
| 401 |
+
" \"\"\"\n",
|
| 402 |
+
" nValues = list(map(lambda va: preturbScalar(va, vrange), values))\n",
|
| 403 |
+
" return nValues\n",
|
| 404 |
+
"\n",
|
| 405 |
+
"def randomShiftVector(values, smin, smax):\n",
|
| 406 |
+
" \"\"\"\n",
|
| 407 |
+
" shifts a list by a random quanity with a range\n",
|
| 408 |
+
"\n",
|
| 409 |
+
" Parameters\n",
|
| 410 |
+
" values : list data\n",
|
| 411 |
+
" smin : samplinf minimum\n",
|
| 412 |
+
" smax : sampling maximum\n",
|
| 413 |
+
" \"\"\"\n",
|
| 414 |
+
" shift = np.random.uniform(smin, smax)\n",
|
| 415 |
+
" return list(map(lambda va: va + shift, values))\n",
|
| 416 |
+
"\n",
|
| 417 |
+
"def floatRange(beg, end, incr):\n",
|
| 418 |
+
" \"\"\"\n",
|
| 419 |
+
" generates float range\n",
|
| 420 |
+
"\n",
|
| 421 |
+
" Parameters\n",
|
| 422 |
+
" beg :range begin\n",
|
| 423 |
+
" end: range end\n",
|
| 424 |
+
" incr : range increment\n",
|
| 425 |
+
" \"\"\"\n",
|
| 426 |
+
" return list(np.arange(beg, end, incr))\n",
|
| 427 |
+
"\n",
|
| 428 |
+
"def shuffle(values, *numShuffles):\n",
|
| 429 |
+
" \"\"\"\n",
|
| 430 |
+
" in place shuffling with swap of pairs\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" Parameters\n",
|
| 433 |
+
" values : list data\n",
|
| 434 |
+
" numShuffles : parameter list for number of shuffles\n",
|
| 435 |
+
" \"\"\"\n",
|
| 436 |
+
" size = len(values)\n",
|
| 437 |
+
" if len(numShuffles) == 0:\n",
|
| 438 |
+
" numShuffle = int(size / 2)\n",
|
| 439 |
+
" elif len(numShuffles) == 1:\n",
|
| 440 |
+
" numShuffle = numShuffles[0]\n",
|
| 441 |
+
" else:\n",
|
| 442 |
+
" numShuffle = randint(numShuffles[0], numShuffles[1])\n",
|
| 443 |
+
" print(\"numShuffle {}\".format(numShuffle))\n",
|
| 444 |
+
" for i in range(numShuffle):\n",
|
| 445 |
+
" first = random.randint(0, size - 1)\n",
|
| 446 |
+
" second = random.randint(0, size - 1)\n",
|
| 447 |
+
" while first == second:\n",
|
| 448 |
+
" second = random.randint(0, size - 1)\n",
|
| 449 |
+
" tmp = values[first]\n",
|
| 450 |
+
" values[first] = values[second]\n",
|
| 451 |
+
" values[second] = tmp\n",
|
| 452 |
+
"\n",
|
| 453 |
+
"\n",
|
| 454 |
+
"def splitList(itms, numGr):\n",
|
| 455 |
+
" \"\"\"\n",
|
| 456 |
+
" splits a list into sub lists of approximately equal size, with items in sublists randomly chod=sen\n",
|
| 457 |
+
"\n",
|
| 458 |
+
" Parameters\n",
|
| 459 |
+
" itms ; list of values\t\t\n",
|
| 460 |
+
" numGr : no of groups\n",
|
| 461 |
+
" \"\"\"\n",
|
| 462 |
+
" tcount = len(itms)\n",
|
| 463 |
+
" cItems = list(itms)\n",
|
| 464 |
+
" sz = int(len(cItems) / numGr)\n",
|
| 465 |
+
" groups = list()\n",
|
| 466 |
+
" count = 0\n",
|
| 467 |
+
" for i in range(numGr):\n",
|
| 468 |
+
" if (i == numGr - 1):\n",
|
| 469 |
+
" csz = tcount - count\n",
|
| 470 |
+
" else:\n",
|
| 471 |
+
" csz = sz + randint(-2, 2)\n",
|
| 472 |
+
" count += csz\n",
|
| 473 |
+
" gr = list()\n",
|
| 474 |
+
" for j in range(csz):\n",
|
| 475 |
+
" it = selectRandomFromList(cItems)\n",
|
| 476 |
+
" gr.append(it)\n",
|
| 477 |
+
" cItems.remove(it)\n",
|
| 478 |
+
" groups.append(gr)\n",
|
| 479 |
+
" return groups\n",
|
| 480 |
+
"\n",
|
| 481 |
+
"def multVector(values, vrange):\n",
|
| 482 |
+
" \"\"\"\n",
|
| 483 |
+
" multiplies a list within value range\n",
|
| 484 |
+
"\n",
|
| 485 |
+
" Parameters\n",
|
| 486 |
+
" values : list of values\n",
|
| 487 |
+
" vrange : fraction of vaue to be used to update\n",
|
| 488 |
+
" \"\"\"\n",
|
| 489 |
+
" scale = 1.0 - vrange + 2 * vrange * random.random()\n",
|
| 490 |
+
" nValues = list(map(lambda va: va * scale, values))\n",
|
| 491 |
+
" return nValues\n",
|
| 492 |
+
"\n",
|
| 493 |
+
"def weightedAverage(values, weights):\n",
|
| 494 |
+
" \"\"\"\n",
|
| 495 |
+
" calculates weighted average\n",
|
| 496 |
+
"\n",
|
| 497 |
+
" Parameters\n",
|
| 498 |
+
" values : list of values\n",
|
| 499 |
+
" weights : list of weights\n",
|
| 500 |
+
" \"\"\"\t\t\n",
|
| 501 |
+
" assert len(values) == len(weights), \"values and weights should be same size\"\n",
|
| 502 |
+
" vw = zip(values, weights)\n",
|
| 503 |
+
" wva = list(map(lambda e : e[0] * e[1], vw))\n",
|
| 504 |
+
" #wa = sum(x * y for x, y in vw) / sum(weights)\n",
|
| 505 |
+
" wav = sum(wva) / sum(weights)\n",
|
| 506 |
+
" return wav\n",
|
| 507 |
+
"\n",
|
| 508 |
+
"def extractFields(line, delim, keepIndices):\n",
|
| 509 |
+
" \"\"\"\n",
|
| 510 |
+
" breaks a line into fields and keeps only specified fileds and returns new line\n",
|
| 511 |
+
"\n",
|
| 512 |
+
" Parameters\n",
|
| 513 |
+
" line ; deli separated string\n",
|
| 514 |
+
" delim : delemeter\n",
|
| 515 |
+
" keepIndices : list of indexes to fields to be retained\n",
|
| 516 |
+
" \"\"\"\n",
|
| 517 |
+
" items = line.split(delim)\n",
|
| 518 |
+
" newLine = []\n",
|
| 519 |
+
" for i in keepIndices:\n",
|
| 520 |
+
" newLine.append(line[i])\n",
|
| 521 |
+
" return delim.join(newLine)\n",
|
| 522 |
+
"\n",
|
| 523 |
+
"def remFields(line, delim, remIndices):\n",
|
| 524 |
+
" \"\"\"\n",
|
| 525 |
+
" removes fields from delim separated string\n",
|
| 526 |
+
"\n",
|
| 527 |
+
" Parameters\n",
|
| 528 |
+
" line ; delemeter separated string\n",
|
| 529 |
+
" delim : delemeter\n",
|
| 530 |
+
" remIndices : list of indexes to fields to be removed\n",
|
| 531 |
+
" \"\"\"\n",
|
| 532 |
+
" items = line.split(delim)\n",
|
| 533 |
+
" newLine = []\n",
|
| 534 |
+
" for i in range(len(items)):\n",
|
| 535 |
+
" if not arrayContains(remIndices, i):\n",
|
| 536 |
+
" newLine.append(line[i])\n",
|
| 537 |
+
" return delim.join(newLine)\n",
|
| 538 |
+
"\n",
|
| 539 |
+
"def extractList(data, indices):\n",
|
| 540 |
+
" \"\"\"\n",
|
| 541 |
+
" extracts list from another list, given indices\n",
|
| 542 |
+
"\n",
|
| 543 |
+
" Parameters\n",
|
| 544 |
+
" remIndices : list data\n",
|
| 545 |
+
" indices : list of indexes to fields to be retained\n",
|
| 546 |
+
" \"\"\"\n",
|
| 547 |
+
" if areAllFieldsIncluded(data, indices):\n",
|
| 548 |
+
" exList = data.copy()\n",
|
| 549 |
+
" #print(\"all indices\")\n",
|
| 550 |
+
" else:\n",
|
| 551 |
+
" exList = list()\n",
|
| 552 |
+
" le = len(data)\n",
|
| 553 |
+
" for i in indices:\n",
|
| 554 |
+
" assert i < le , \"index {} out of bound {}\".format(i, le)\n",
|
| 555 |
+
" exList.append(data[i])\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" return exList\n",
|
| 558 |
+
"\n",
|
| 559 |
+
"def arrayContains(arr, item):\n",
|
| 560 |
+
" \"\"\"\n",
|
| 561 |
+
" checks if array contains an item \n",
|
| 562 |
+
"\n",
|
| 563 |
+
" Parameters\n",
|
| 564 |
+
" arr : list data\n",
|
| 565 |
+
" item : item to search\n",
|
| 566 |
+
" \"\"\"\n",
|
| 567 |
+
" contains = True\n",
|
| 568 |
+
" try:\n",
|
| 569 |
+
" arr.index(item)\n",
|
| 570 |
+
" except ValueError:\n",
|
| 571 |
+
" contains = False\n",
|
| 572 |
+
" return contains\n",
|
| 573 |
+
"\n",
|
| 574 |
+
"def strToIntArray(line, delim=\",\"):\n",
|
| 575 |
+
" \"\"\"\n",
|
| 576 |
+
" int array from delim separated string\n",
|
| 577 |
+
"\n",
|
| 578 |
+
" Parameters\n",
|
| 579 |
+
" line ; delemeter separated string\n",
|
| 580 |
+
" \"\"\"\n",
|
| 581 |
+
" arr = line.split(delim)\n",
|
| 582 |
+
" return [int(a) for a in arr]\n",
|
| 583 |
+
"\n",
|
| 584 |
+
"def strToFloatArray(line, delim=\",\"):\n",
|
| 585 |
+
" \"\"\"\n",
|
| 586 |
+
" float array from delim separated string\n",
|
| 587 |
+
"\n",
|
| 588 |
+
" Parameters\n",
|
| 589 |
+
" line ; delemeter separated string\n",
|
| 590 |
+
" \"\"\"\n",
|
| 591 |
+
" arr = line.split(delim)\n",
|
| 592 |
+
" return [float(a) for a in arr]\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"def strListOrRangeToIntArray(line):\n",
|
| 595 |
+
" \"\"\"\n",
|
| 596 |
+
" int array from delim separated string or range\n",
|
| 597 |
+
"\n",
|
| 598 |
+
" Parameters\n",
|
| 599 |
+
" line ; delemeter separated string\n",
|
| 600 |
+
" \"\"\"\n",
|
| 601 |
+
" varr = line.split(\",\")\n",
|
| 602 |
+
" if (len(varr) > 1):\n",
|
| 603 |
+
" iarr = list(map(lambda v: int(v), varr))\n",
|
| 604 |
+
" else:\n",
|
| 605 |
+
" vrange = line.split(\":\")\n",
|
| 606 |
+
" if (len(vrange) == 2):\n",
|
| 607 |
+
" lo = int(vrange[0])\n",
|
| 608 |
+
" hi = int(vrange[1])\n",
|
| 609 |
+
" iarr = list(range(lo, hi+1))\n",
|
| 610 |
+
" else:\n",
|
| 611 |
+
" iarr = [int(line)]\n",
|
| 612 |
+
" return iarr\n",
|
| 613 |
+
"\n",
|
| 614 |
+
"def toStr(val, precision):\n",
|
| 615 |
+
" \"\"\"\n",
|
| 616 |
+
" converts any type to string\t\n",
|
| 617 |
+
"\n",
|
| 618 |
+
" Parameters\n",
|
| 619 |
+
" val : value\n",
|
| 620 |
+
" precision ; precision for float value\n",
|
| 621 |
+
" \"\"\"\n",
|
| 622 |
+
" if type(val) == float or type(val) == np.float64 or type(val) == np.float32:\n",
|
| 623 |
+
" format = \"%\" + \".%df\" %(precision)\n",
|
| 624 |
+
" sVal = format %(val)\n",
|
| 625 |
+
" else:\n",
|
| 626 |
+
" sVal = str(val)\n",
|
| 627 |
+
" return sVal\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"def toStrFromList(values, precision, delim=\",\"):\n",
|
| 630 |
+
" \"\"\"\n",
|
| 631 |
+
" converts list of any type to delim separated string\n",
|
| 632 |
+
"\n",
|
| 633 |
+
" Parameters\n",
|
| 634 |
+
" values : list data\n",
|
| 635 |
+
" precision ; precision for float value\n",
|
| 636 |
+
" delim : delemeter\n",
|
| 637 |
+
" \"\"\"\n",
|
| 638 |
+
" sValues = list(map(lambda v: toStr(v, precision), values))\n",
|
| 639 |
+
" return delim.join(sValues)\n",
|
| 640 |
+
"\n",
|
| 641 |
+
"def toIntList(values):\n",
|
| 642 |
+
" \"\"\"\n",
|
| 643 |
+
" convert to int list\n",
|
| 644 |
+
"\n",
|
| 645 |
+
" Parameters\n",
|
| 646 |
+
" values : list data\n",
|
| 647 |
+
" \"\"\"\n",
|
| 648 |
+
" return list(map(lambda va: int(va), values))\n",
|
| 649 |
+
"\n",
|
| 650 |
+
"def toFloatList(values):\n",
|
| 651 |
+
" \"\"\"\n",
|
| 652 |
+
" convert to float list\n",
|
| 653 |
+
"\n",
|
| 654 |
+
" Parameters\n",
|
| 655 |
+
" values : list data\n",
|
| 656 |
+
" \"\"\"\n",
|
| 657 |
+
" return list(map(lambda va: float(va), values))\n",
|
| 658 |
+
"\n",
|
| 659 |
+
"def toStrList(values, precision=None):\n",
|
| 660 |
+
" \"\"\"\n",
|
| 661 |
+
" convert to string list\n",
|
| 662 |
+
"\n",
|
| 663 |
+
" Parameters\n",
|
| 664 |
+
" values : list data\n",
|
| 665 |
+
" precision ; precision for float value\n",
|
| 666 |
+
" \"\"\"\n",
|
| 667 |
+
" return list(map(lambda va: toStr(va, precision), values))\n",
|
| 668 |
+
"\n",
|
| 669 |
+
"def toIntFromBoolean(value):\n",
|
| 670 |
+
" \"\"\"\n",
|
| 671 |
+
" convert to int\n",
|
| 672 |
+
"\n",
|
| 673 |
+
" Parameters\n",
|
| 674 |
+
" value : boolean value\n",
|
| 675 |
+
" \"\"\"\n",
|
| 676 |
+
" ival = 1 if value else 0\n",
|
| 677 |
+
" return ival\n",
|
| 678 |
+
"\n",
|
| 679 |
+
"def typedValue(val, dtype=None):\n",
|
| 680 |
+
" \"\"\"\n",
|
| 681 |
+
" return typed value given string, discovers data type if not specified\n",
|
| 682 |
+
"\n",
|
| 683 |
+
" Parameters\n",
|
| 684 |
+
" val : value\n",
|
| 685 |
+
" dtype : data type\n",
|
| 686 |
+
" \"\"\"\n",
|
| 687 |
+
" tVal = None\n",
|
| 688 |
+
"\n",
|
| 689 |
+
" if dtype is not None:\n",
|
| 690 |
+
" if dtype == \"num\":\n",
|
| 691 |
+
" dtype = \"int\" if dtype.find(\".\") == -1 else \"float\"\n",
|
| 692 |
+
"\n",
|
| 693 |
+
" if dtype == \"int\":\n",
|
| 694 |
+
" tVal = int(val)\n",
|
| 695 |
+
" elif dtype == \"float\":\n",
|
| 696 |
+
" tVal = float(val)\n",
|
| 697 |
+
" elif dtype == \"bool\":\n",
|
| 698 |
+
" tVal = bool(val)\n",
|
| 699 |
+
" else:\n",
|
| 700 |
+
" tVal = val\n",
|
| 701 |
+
" else:\n",
|
| 702 |
+
" if type(val) == str:\n",
|
| 703 |
+
" lVal = val.lower()\n",
|
| 704 |
+
"\n",
|
| 705 |
+
" #int\n",
|
| 706 |
+
" done = True\n",
|
| 707 |
+
" try:\n",
|
| 708 |
+
" tVal = int(val)\n",
|
| 709 |
+
" except ValueError:\n",
|
| 710 |
+
" done = False\n",
|
| 711 |
+
"\n",
|
| 712 |
+
" #float\n",
|
| 713 |
+
" if not done:\n",
|
| 714 |
+
" done = True\n",
|
| 715 |
+
" try:\n",
|
| 716 |
+
" tVal = float(val)\n",
|
| 717 |
+
" except ValueError:\n",
|
| 718 |
+
" done = False\n",
|
| 719 |
+
"\n",
|
| 720 |
+
" #boolean\n",
|
| 721 |
+
" if not done:\n",
|
| 722 |
+
" done = True\n",
|
| 723 |
+
" if lVal == \"true\":\n",
|
| 724 |
+
" tVal = True\n",
|
| 725 |
+
" elif lVal == \"false\":\n",
|
| 726 |
+
" tVal = False\n",
|
| 727 |
+
" else:\n",
|
| 728 |
+
" done = False\n",
|
| 729 |
+
" #None\t\t\n",
|
| 730 |
+
" if not done:\n",
|
| 731 |
+
" if lVal == \"none\":\n",
|
| 732 |
+
" tVal = None\n",
|
| 733 |
+
" else:\n",
|
| 734 |
+
" tVal = val\n",
|
| 735 |
+
" else:\n",
|
| 736 |
+
" tVal = val\n",
|
| 737 |
+
"\n",
|
| 738 |
+
" return tVal\n",
|
| 739 |
+
"\n",
|
| 740 |
+
"def getAllFiles(dirPath):\n",
|
| 741 |
+
" \"\"\"\n",
|
| 742 |
+
" get all files recursively\n",
|
| 743 |
+
"\n",
|
| 744 |
+
" Parameters\n",
|
| 745 |
+
" dirPath : directory path\n",
|
| 746 |
+
" \"\"\"\n",
|
| 747 |
+
" filePaths = []\n",
|
| 748 |
+
" for (thisDir, subDirs, fileNames) in os.walk(dirPath):\n",
|
| 749 |
+
" for fileName in fileNames:\n",
|
| 750 |
+
" filePaths.append(os.path.join(thisDir, fileName))\n",
|
| 751 |
+
" filePaths.sort()\n",
|
| 752 |
+
" return filePaths\n",
|
| 753 |
+
"\n",
|
| 754 |
+
"def getFileContent(fpath, verbose=False):\n",
|
| 755 |
+
" \"\"\"\n",
|
| 756 |
+
" get file contents in directory\n",
|
| 757 |
+
"\n",
|
| 758 |
+
" Parameters\n",
|
| 759 |
+
" fpath ; directory path\n",
|
| 760 |
+
" verbose : verbosity flag\n",
|
| 761 |
+
" \"\"\"\n",
|
| 762 |
+
" # dcument list\n",
|
| 763 |
+
" docComplete = []\n",
|
| 764 |
+
" filePaths = getAllFiles(fpath)\n",
|
| 765 |
+
"\n",
|
| 766 |
+
" # read files\n",
|
| 767 |
+
" for filePath in filePaths:\n",
|
| 768 |
+
" if verbose:\n",
|
| 769 |
+
" print(\"next file \" + filePath)\n",
|
| 770 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 771 |
+
" content = contentFile.read()\n",
|
| 772 |
+
" docComplete.append(content)\n",
|
| 773 |
+
" return (docComplete, filePaths)\n",
|
| 774 |
+
"\n",
|
| 775 |
+
"def getOneFileContent(fpath):\n",
|
| 776 |
+
" \"\"\"\n",
|
| 777 |
+
" get one file contents\n",
|
| 778 |
+
"\n",
|
| 779 |
+
" Parameters\n",
|
| 780 |
+
" fpath : file path\n",
|
| 781 |
+
" \"\"\"\n",
|
| 782 |
+
" with open(fpath, 'r') as contentFile:\n",
|
| 783 |
+
" docStr = contentFile.read()\n",
|
| 784 |
+
" return docStr\n",
|
| 785 |
+
"\n",
|
| 786 |
+
"def getFileLines(dirPath, delim=\",\"):\n",
|
| 787 |
+
" \"\"\"\n",
|
| 788 |
+
" get lines from a file\n",
|
| 789 |
+
"\n",
|
| 790 |
+
" Parameters\n",
|
| 791 |
+
" dirPath : file path\n",
|
| 792 |
+
" delim : delemeter\n",
|
| 793 |
+
" \"\"\"\n",
|
| 794 |
+
" lines = list()\n",
|
| 795 |
+
" for li in fileRecGen(dirPath, delim):\n",
|
| 796 |
+
" lines.append(li)\n",
|
| 797 |
+
" return lines\n",
|
| 798 |
+
"\n",
|
| 799 |
+
"def getFileSampleLines(dirPath, percen, delim=\",\"):\n",
|
| 800 |
+
" \"\"\"\n",
|
| 801 |
+
" get sampled lines from a file\n",
|
| 802 |
+
"\n",
|
| 803 |
+
" Parameters\n",
|
| 804 |
+
" dirPath : file path\n",
|
| 805 |
+
" percen : sampling percentage\n",
|
| 806 |
+
" delim : delemeter\n",
|
| 807 |
+
" \"\"\"\n",
|
| 808 |
+
" lines = list()\n",
|
| 809 |
+
" for li in fileRecGen(dirPath, delim):\n",
|
| 810 |
+
" if randint(0, 100) < percen:\n",
|
| 811 |
+
" lines.append(li)\n",
|
| 812 |
+
" return lines\n",
|
| 813 |
+
"\n",
|
| 814 |
+
"def getFileColumnAsString(dirPath, index, delim=\",\"):\n",
|
| 815 |
+
" \"\"\"\n",
|
| 816 |
+
" get string column from a file\n",
|
| 817 |
+
"\n",
|
| 818 |
+
" Parameters\n",
|
| 819 |
+
" dirPath : file path\n",
|
| 820 |
+
" index : index\n",
|
| 821 |
+
" delim : delemeter\n",
|
| 822 |
+
" \"\"\"\n",
|
| 823 |
+
" fields = list()\n",
|
| 824 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 825 |
+
" fields.append(rec[index])\n",
|
| 826 |
+
" #print(fields)\t\n",
|
| 827 |
+
" return fields\n",
|
| 828 |
+
"\n",
|
| 829 |
+
"def getFileColumnsAsString(dirPath, indexes, delim=\",\"):\n",
|
| 830 |
+
" \"\"\"\n",
|
| 831 |
+
" get multiple string columns from a file\n",
|
| 832 |
+
"\n",
|
| 833 |
+
" Parameters\n",
|
| 834 |
+
" dirPath : file path\n",
|
| 835 |
+
" indexes : indexes of columns\n",
|
| 836 |
+
" delim : delemeter\n",
|
| 837 |
+
" \"\"\"\n",
|
| 838 |
+
" nindex = len(indexes)\n",
|
| 839 |
+
" columns = list(map(lambda i : list(), range(nindex)))\n",
|
| 840 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 841 |
+
" for i in range(nindex):\n",
|
| 842 |
+
" columns[i].append(rec[indexes[i]])\n",
|
| 843 |
+
" return columns\n",
|
| 844 |
+
"\n",
|
| 845 |
+
"def getFileColumnAsFloat(dirPath, index, delim=\",\"):\n",
|
| 846 |
+
" \"\"\"\n",
|
| 847 |
+
" get float fileds from a file\n",
|
| 848 |
+
"\n",
|
| 849 |
+
" Parameters\n",
|
| 850 |
+
" dirPath : file path\n",
|
| 851 |
+
" index : index\n",
|
| 852 |
+
" delim : delemeter\n",
|
| 853 |
+
" \"\"\"\n",
|
| 854 |
+
" #print(\"{} {}\".format(dirPath, index))\n",
|
| 855 |
+
" fields = getFileColumnAsString(dirPath, index, delim)\n",
|
| 856 |
+
" return list(map(lambda v:float(v), fields))\n",
|
| 857 |
+
"\n",
|
| 858 |
+
"def getFileColumnAsInt(dirPath, index, delim=\",\"):\n",
|
| 859 |
+
" \"\"\"\n",
|
| 860 |
+
" get float fileds from a file\n",
|
| 861 |
+
"\n",
|
| 862 |
+
" Parameters\n",
|
| 863 |
+
" dirPath : file path\n",
|
| 864 |
+
" index : index\n",
|
| 865 |
+
" delim : delemeter\n",
|
| 866 |
+
" \"\"\"\n",
|
| 867 |
+
" fields = getFileColumnAsString(dirPath, index, delim)\n",
|
| 868 |
+
" return list(map(lambda v:int(v), fields))\n",
|
| 869 |
+
"\n",
|
| 870 |
+
"def getFileAsIntMatrix(dirPath, columns, delim=\",\"):\n",
|
| 871 |
+
" \"\"\"\n",
|
| 872 |
+
" extracts int matrix from csv file given column indices with each row being concatenation of \n",
|
| 873 |
+
" extracted column values row size = num of columns\n",
|
| 874 |
+
"\n",
|
| 875 |
+
" Parameters\n",
|
| 876 |
+
" dirPath : file path\n",
|
| 877 |
+
" columns : indexes of columns\n",
|
| 878 |
+
" delim : delemeter\n",
|
| 879 |
+
" \"\"\"\n",
|
| 880 |
+
" mat = list()\n",
|
| 881 |
+
" for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
|
| 882 |
+
" mat.append(asIntList(rec))\n",
|
| 883 |
+
" return mat\n",
|
| 884 |
+
"\n",
|
| 885 |
+
"def getFileAsFloatMatrix(dirPath, columns, delim=\",\"):\n",
|
| 886 |
+
" \"\"\"\n",
|
| 887 |
+
" extracts float matrix from csv file given column indices with each row being concatenation of \n",
|
| 888 |
+
" extracted column values row size = num of columns\n",
|
| 889 |
+
" Parameters\n",
|
| 890 |
+
" dirPath : file path\n",
|
| 891 |
+
" columns : indexes of columns\n",
|
| 892 |
+
" delim : delemeter\n",
|
| 893 |
+
" \"\"\"\n",
|
| 894 |
+
" mat = list()\n",
|
| 895 |
+
" for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
|
| 896 |
+
" mat.append(asFloatList(rec))\n",
|
| 897 |
+
" return mat\n",
|
| 898 |
+
"\n",
|
| 899 |
+
"def getFileAsFloatColumn(dirPath):\n",
|
| 900 |
+
" \"\"\"\n",
|
| 901 |
+
" grt float list from a file with one float per row\n",
|
| 902 |
+
" Parameters\n",
|
| 903 |
+
" dirPath : file path\n",
|
| 904 |
+
" \"\"\"\n",
|
| 905 |
+
" flist = list()\n",
|
| 906 |
+
" for rec in fileRecGen(dirPath, None):\n",
|
| 907 |
+
" flist.append(float(rec))\n",
|
| 908 |
+
" return flist\n",
|
| 909 |
+
"\n",
|
| 910 |
+
"def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=\",\"):\n",
|
| 911 |
+
" \"\"\"\n",
|
| 912 |
+
" extracts float matrix from csv file given row filter and column indices with each row being \n",
|
| 913 |
+
" concatenation of extracted column values row size = num of columns\n",
|
| 914 |
+
" Parameters\n",
|
| 915 |
+
" dirPath : file path\n",
|
| 916 |
+
" columns : indexes of columns\n",
|
| 917 |
+
" filt : row filter lambda\n",
|
| 918 |
+
" delim : delemeter\n",
|
| 919 |
+
" \"\"\"\n",
|
| 920 |
+
" mat = list()\n",
|
| 921 |
+
" for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):\n",
|
| 922 |
+
" mat.append(asFloatList(rec))\n",
|
| 923 |
+
" return mat\n",
|
| 924 |
+
"\n",
|
| 925 |
+
"def getFileAsTypedRecords(dirPath, types, delim=\",\"):\n",
|
| 926 |
+
" \"\"\"\n",
|
| 927 |
+
" extracts typed records from csv file with each row being concatenation of \n",
|
| 928 |
+
" extracted column values \n",
|
| 929 |
+
" Parameters\n",
|
| 930 |
+
" dirPath : file path\n",
|
| 931 |
+
" types : data types\n",
|
| 932 |
+
" delim : delemeter\n",
|
| 933 |
+
" \"\"\"\n",
|
| 934 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\t\n",
|
| 935 |
+
" tdata = list()\n",
|
| 936 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 937 |
+
" trec = list()\n",
|
| 938 |
+
" for index, value in enumerate(rec):\n",
|
| 939 |
+
" value = __convToTyped(index, value, dtypes)\n",
|
| 940 |
+
" trec.append(value)\n",
|
| 941 |
+
" tdata.append(trec)\n",
|
| 942 |
+
" return tdata\n",
|
| 943 |
+
"\n",
|
| 944 |
+
"\n",
|
| 945 |
+
"def getFileColsAsTypedRecords(dirPath, columns, types, delim=\",\"):\n",
|
| 946 |
+
" \"\"\"\n",
|
| 947 |
+
" extracts typed records from csv file given column indices with each row being concatenation of \n",
|
| 948 |
+
" extracted column values \n",
|
| 949 |
+
" Parameters\n",
|
| 950 |
+
" Parameters\n",
|
| 951 |
+
" dirPath : file path\n",
|
| 952 |
+
" columns : column indexes\n",
|
| 953 |
+
" types : data types\n",
|
| 954 |
+
" delim : delemeter\n",
|
| 955 |
+
" \"\"\"\n",
|
| 956 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\t\n",
|
| 957 |
+
" tdata = list()\n",
|
| 958 |
+
" for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
|
| 959 |
+
" trec = list()\n",
|
| 960 |
+
" for indx, value in enumerate(rec):\n",
|
| 961 |
+
" tindx = columns[indx]\n",
|
| 962 |
+
" value = __convToTyped(tindx, value, dtypes)\n",
|
| 963 |
+
" trec.append(value)\n",
|
| 964 |
+
" tdata.append(trec)\n",
|
| 965 |
+
" return tdata\n",
|
| 966 |
+
"\n",
|
| 967 |
+
"def getFileColumnsMinMax(dirPath, columns, dtype, delim=\",\"):\n",
|
| 968 |
+
" \"\"\"\n",
|
| 969 |
+
" extracts numeric matrix from csv file given column indices. For each column return min and max\n",
|
| 970 |
+
" Parameters\n",
|
| 971 |
+
" dirPath : file path\n",
|
| 972 |
+
" columns : column indexes\n",
|
| 973 |
+
" dtype : data type\n",
|
| 974 |
+
" delim : delemeter\n",
|
| 975 |
+
" \"\"\"\n",
|
| 976 |
+
" dtypes = list(map(lambda c : str(c) + \":\" + dtype, columns))\n",
|
| 977 |
+
" dtypes = \",\".join(dtypes)\n",
|
| 978 |
+
" #print(dtypes)\n",
|
| 979 |
+
"\n",
|
| 980 |
+
" tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)\n",
|
| 981 |
+
" minMax = list()\n",
|
| 982 |
+
" ncola = len(tdata[0])\n",
|
| 983 |
+
" ncole = len(columns)\n",
|
| 984 |
+
" assertEqual(ncola, ncole, \"actual no of columns different from expected\")\n",
|
| 985 |
+
"\n",
|
| 986 |
+
" for ci in range(ncole):\t\n",
|
| 987 |
+
" vmin = sys.float_info.max\n",
|
| 988 |
+
" vmax = sys.float_info.min\n",
|
| 989 |
+
" for r in tdata:\n",
|
| 990 |
+
" cv = r[ci]\n",
|
| 991 |
+
" vmin = cv if cv < vmin else vmin\n",
|
| 992 |
+
" vmax = cv if cv > vmax else vmax\n",
|
| 993 |
+
" mm = (vmin, vmax, vmax - vmin)\n",
|
| 994 |
+
" minMax.append(mm)\n",
|
| 995 |
+
"\n",
|
| 996 |
+
" return minMax\n",
|
| 997 |
+
"\n",
|
| 998 |
+
"\n",
|
| 999 |
+
"def getRecAsTypedRecord(rec, types, delim=None):\n",
|
| 1000 |
+
" \"\"\"\n",
|
| 1001 |
+
" converts record to typed records \n",
|
| 1002 |
+
" Parameters\n",
|
| 1003 |
+
" rec : delemeter separate string or list of string\n",
|
| 1004 |
+
" types : field data types\n",
|
| 1005 |
+
" delim : delemeter\n",
|
| 1006 |
+
" \"\"\"\t\n",
|
| 1007 |
+
" if delim is not None:\n",
|
| 1008 |
+
" rec = rec.split(delim)\n",
|
| 1009 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\t\n",
|
| 1010 |
+
" #print(types)\n",
|
| 1011 |
+
" #print(dtypes)\n",
|
| 1012 |
+
" trec = list()\n",
|
| 1013 |
+
" for ind, value in enumerate(rec):\n",
|
| 1014 |
+
" tvalue = __convToTyped(ind, value, dtypes)\n",
|
| 1015 |
+
" trec.append(tvalue)\n",
|
| 1016 |
+
" return trec\n",
|
| 1017 |
+
"\n",
|
| 1018 |
+
"def __convToTyped(index, value, dtypes):\n",
|
| 1019 |
+
" \"\"\"\n",
|
| 1020 |
+
" convert to typed value \n",
|
| 1021 |
+
" Parameters\n",
|
| 1022 |
+
" index : index in type list\n",
|
| 1023 |
+
" value : data value\n",
|
| 1024 |
+
" dtypes : data type list\n",
|
| 1025 |
+
" \"\"\"\n",
|
| 1026 |
+
" #print(index, value)\n",
|
| 1027 |
+
" dtype = dtypes[index]\n",
|
| 1028 |
+
" tvalue = value\n",
|
| 1029 |
+
" if dtype == \"int\":\n",
|
| 1030 |
+
" tvalue = int(value)\n",
|
| 1031 |
+
" elif dtype == \"float\":\n",
|
| 1032 |
+
" tvalue = float(value)\n",
|
| 1033 |
+
" return tvalue\n",
|
| 1034 |
+
"\n",
|
| 1035 |
+
"\n",
|
| 1036 |
+
"\n",
|
| 1037 |
+
"def extractTypesFromString(types):\n",
|
| 1038 |
+
" \"\"\"\n",
|
| 1039 |
+
" extracts column data types and set values for categorical variables \n",
|
| 1040 |
+
" Parameters\n",
|
| 1041 |
+
" types : encoded type information\n",
|
| 1042 |
+
" \"\"\"\n",
|
| 1043 |
+
" ftypes = types.split(\",\")\n",
|
| 1044 |
+
" dtypes = dict()\n",
|
| 1045 |
+
" cvalues = dict()\n",
|
| 1046 |
+
" for ftype in ftypes:\n",
|
| 1047 |
+
" items = ftype.split(\":\") \n",
|
| 1048 |
+
" cindex = int(items[0])\n",
|
| 1049 |
+
" dtype = items[1]\n",
|
| 1050 |
+
" dtypes[cindex] = dtype\n",
|
| 1051 |
+
" if len(items) == 3:\n",
|
| 1052 |
+
" sitems = items[2].split()\n",
|
| 1053 |
+
" cvalues[cindex] = sitems\n",
|
| 1054 |
+
" return (dtypes, cvalues)\n",
|
| 1055 |
+
"\n",
|
| 1056 |
+
"def getMultipleFileAsInttMatrix(dirPathWithCol, delim=\",\"):\n",
|
| 1057 |
+
" \"\"\"\n",
|
| 1058 |
+
" extracts int matrix from from csv files given column index for each file. \n",
|
| 1059 |
+
" num of columns = number of rows in each file and num of rows = number of files\n",
|
| 1060 |
+
" Parameters\n",
|
| 1061 |
+
" dirPathWithCol: list of file path and collumn index pair\n",
|
| 1062 |
+
" delim : delemeter\n",
|
| 1063 |
+
" \"\"\"\n",
|
| 1064 |
+
" mat = list()\n",
|
| 1065 |
+
" minLen = -1\n",
|
| 1066 |
+
" for path, col in dirPathWithCol:\n",
|
| 1067 |
+
" colVals = getFileColumnAsInt(path, col, delim)\n",
|
| 1068 |
+
" if minLen < 0 or len(colVals) < minLen:\n",
|
| 1069 |
+
" minLen = len(colVals)\n",
|
| 1070 |
+
" mat.append(colVals)\n",
|
| 1071 |
+
"\n",
|
| 1072 |
+
" #make all same length\n",
|
| 1073 |
+
" mat = list(map(lambda li:li[:minLen], mat))\t\n",
|
| 1074 |
+
" return mat\n",
|
| 1075 |
+
"\n",
|
| 1076 |
+
"def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=\",\"):\n",
|
| 1077 |
+
" \"\"\"\n",
|
| 1078 |
+
" extracts float matrix from from csv files given column index for each file. \n",
|
| 1079 |
+
" num of columns = number of rows in each file and num of rows = number of files\n",
|
| 1080 |
+
" Parameters\n",
|
| 1081 |
+
" dirPathWithCol: list of file path and collumn index pair\n",
|
| 1082 |
+
" delim : delemeter\n",
|
| 1083 |
+
" \"\"\"\n",
|
| 1084 |
+
" mat = list()\n",
|
| 1085 |
+
" minLen = -1\n",
|
| 1086 |
+
" for path, col in dirPathWithCol:\n",
|
| 1087 |
+
" colVals = getFileColumnAsFloat(path, col, delim)\n",
|
| 1088 |
+
" if minLen < 0 or len(colVals) < minLen:\n",
|
| 1089 |
+
" minLen = len(colVals)\n",
|
| 1090 |
+
" mat.append(colVals)\n",
|
| 1091 |
+
"\n",
|
| 1092 |
+
" #make all same length\n",
|
| 1093 |
+
" mat = list(map(lambda li:li[:minLen], mat))\n",
|
| 1094 |
+
" return mat\n",
|
| 1095 |
+
"\n",
|
| 1096 |
+
"def writeStrListToFile(ldata, filePath, delem=\",\"):\n",
|
| 1097 |
+
" \"\"\"\n",
|
| 1098 |
+
" writes list of dlem separated string or list of list of string to afile\n",
|
| 1099 |
+
"\n",
|
| 1100 |
+
" Parameters\n",
|
| 1101 |
+
" ldata : list data\n",
|
| 1102 |
+
" filePath : file path\n",
|
| 1103 |
+
" delim : delemeter\n",
|
| 1104 |
+
" \"\"\"\n",
|
| 1105 |
+
" with open(filePath, \"w\") as fh:\n",
|
| 1106 |
+
" for r in ldata:\n",
|
| 1107 |
+
" if type(r) == list:\n",
|
| 1108 |
+
" r = delem.join(r)\n",
|
| 1109 |
+
" fh.write(r + \"\\n\")\n",
|
| 1110 |
+
"\n",
|
| 1111 |
+
"def writeFloatListToFile(ldata, prec, filePath):\n",
|
| 1112 |
+
" \"\"\"\n",
|
| 1113 |
+
" writes float list to file, one value per line\n",
|
| 1114 |
+
"\n",
|
| 1115 |
+
" Parameters\n",
|
| 1116 |
+
" ldata : list data\n",
|
| 1117 |
+
" prec : precision\n",
|
| 1118 |
+
" filePath : file path\n",
|
| 1119 |
+
" \"\"\"\n",
|
| 1120 |
+
" with open(filePath, \"w\") as fh:\n",
|
| 1121 |
+
" for d in ldata:\n",
|
| 1122 |
+
" fh.write(formatFloat(prec, d) + \"\\n\")\n",
|
| 1123 |
+
"\n",
|
| 1124 |
+
"\n",
|
| 1125 |
+
"def takeFirst(elems):\n",
|
| 1126 |
+
" \"\"\"\n",
|
| 1127 |
+
" return fisrt item\n",
|
| 1128 |
+
" Parameters\n",
|
| 1129 |
+
" elems : list of data \n",
|
| 1130 |
+
" \"\"\"\n",
|
| 1131 |
+
" return elems[0]\n",
|
| 1132 |
+
"\n",
|
| 1133 |
+
"def takeSecond(elems):\n",
|
| 1134 |
+
" \"\"\"\n",
|
| 1135 |
+
" return 2nd element\n",
|
| 1136 |
+
" Parameters\n",
|
| 1137 |
+
" elems : list of data \n",
|
| 1138 |
+
" \"\"\"\n",
|
| 1139 |
+
" return elems[1]\n",
|
| 1140 |
+
"\n",
|
| 1141 |
+
"def takeThird(elems):\n",
|
| 1142 |
+
" \"\"\"\n",
|
| 1143 |
+
" returns 3rd element\n",
|
| 1144 |
+
" Parameters\n",
|
| 1145 |
+
" elems : list of data \n",
|
| 1146 |
+
" \"\"\"\n",
|
| 1147 |
+
" return elems[2]\n",
|
| 1148 |
+
"\n",
|
| 1149 |
+
"def addToKeyedCounter(dCounter, key, count=1):\n",
|
| 1150 |
+
" \"\"\"\n",
|
| 1151 |
+
" add to to keyed counter\n",
|
| 1152 |
+
" Parameters\n",
|
| 1153 |
+
" dCounter : dictionary of counters\n",
|
| 1154 |
+
" key : dictionary key\n",
|
| 1155 |
+
" count : count to add\n",
|
| 1156 |
+
" \"\"\"\n",
|
| 1157 |
+
" curCount = dCounter.get(key, 0)\n",
|
| 1158 |
+
" dCounter[key] = curCount + count\n",
|
| 1159 |
+
"\n",
|
| 1160 |
+
"def incrKeyedCounter(dCounter, key):\n",
|
| 1161 |
+
" \"\"\"\n",
|
| 1162 |
+
" increment keyed counter\n",
|
| 1163 |
+
" Parameters\n",
|
| 1164 |
+
" dCounter : dictionary of counters\n",
|
| 1165 |
+
" key : dictionary key\n",
|
| 1166 |
+
" \"\"\"\n",
|
| 1167 |
+
" addToKeyedCounter(dCounter, key, 1)\n",
|
| 1168 |
+
"\n",
|
| 1169 |
+
"def appendKeyedList(dList, key, elem):\n",
|
| 1170 |
+
" \"\"\"\n",
|
| 1171 |
+
" keyed list\n",
|
| 1172 |
+
" Parameters\n",
|
| 1173 |
+
" dList : dictionary of lists\n",
|
| 1174 |
+
" key : dictionary key\n",
|
| 1175 |
+
" elem : value to append\n",
|
| 1176 |
+
" \"\"\"\n",
|
| 1177 |
+
" curList = dList.get(key, [])\n",
|
| 1178 |
+
" curList.append(elem)\n",
|
| 1179 |
+
" dList[key] = curList\n",
|
| 1180 |
+
"\n",
|
| 1181 |
+
"def isNumber(st):\n",
|
| 1182 |
+
" \"\"\"\n",
|
| 1183 |
+
" Returns True is string is a number\n",
|
| 1184 |
+
" Parameters\n",
|
| 1185 |
+
" st : string value\n",
|
| 1186 |
+
" \"\"\"\n",
|
| 1187 |
+
" return st.replace('.','',1).isdigit()\n",
|
| 1188 |
+
"\n",
|
| 1189 |
+
"def removeNan(values):\n",
|
| 1190 |
+
" \"\"\"\n",
|
| 1191 |
+
" removes nan from list\n",
|
| 1192 |
+
" Parameters\n",
|
| 1193 |
+
" values : list data\n",
|
| 1194 |
+
" \"\"\"\n",
|
| 1195 |
+
" return list(filter(lambda v: not math.isnan(v), values))\n",
|
| 1196 |
+
"\n",
|
| 1197 |
+
"def fileRecGen(filePath, delim = \",\"):\n",
|
| 1198 |
+
" \"\"\"\n",
|
| 1199 |
+
" file record generator\n",
|
| 1200 |
+
" Parameters\n",
|
| 1201 |
+
" filePath ; file path\n",
|
| 1202 |
+
" delim : delemeter\n",
|
| 1203 |
+
" \"\"\"\n",
|
| 1204 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1205 |
+
" for line in fp:\t\n",
|
| 1206 |
+
" line = line[:-1]\n",
|
| 1207 |
+
" if delim is not None:\n",
|
| 1208 |
+
" line = line.split(delim)\n",
|
| 1209 |
+
" yield line\n",
|
| 1210 |
+
"\n",
|
| 1211 |
+
"def fileSelFieldsRecGen(dirPath, columns, delim=\",\"):\n",
|
| 1212 |
+
" \"\"\"\n",
|
| 1213 |
+
" file record generator given column indices \n",
|
| 1214 |
+
" Parameters\n",
|
| 1215 |
+
" filePath ; file path\n",
|
| 1216 |
+
" columns : column indexes as int array or coma separated string\n",
|
| 1217 |
+
" delim : delemeter\n",
|
| 1218 |
+
" \"\"\"\n",
|
| 1219 |
+
" if type(columns) == str:\n",
|
| 1220 |
+
" columns = strToIntArray(columns, delim)\n",
|
| 1221 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 1222 |
+
" extracted = extractList(rec, columns)\n",
|
| 1223 |
+
" yield extracted\n",
|
| 1224 |
+
"\n",
|
| 1225 |
+
"def fileFiltRecGen(filePath, filt, delim = \",\"):\n",
|
| 1226 |
+
" \"\"\"\n",
|
| 1227 |
+
" file record generator with row filter applied\n",
|
| 1228 |
+
" Parameters\n",
|
| 1229 |
+
" filePath ; file path\n",
|
| 1230 |
+
" filt : row filter\n",
|
| 1231 |
+
" delim : delemeter\n",
|
| 1232 |
+
" \"\"\"\n",
|
| 1233 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1234 |
+
" for line in fp:\t\n",
|
| 1235 |
+
" line = line[:-1]\n",
|
| 1236 |
+
" if delim is not None:\n",
|
| 1237 |
+
" line = line.split(delim)\n",
|
| 1238 |
+
" if filt(line):\n",
|
| 1239 |
+
" yield line\n",
|
| 1240 |
+
"\n",
|
| 1241 |
+
"def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = \",\"):\n",
|
| 1242 |
+
" \"\"\"\n",
|
| 1243 |
+
" file record generator with row and column filter applied\n",
|
| 1244 |
+
" Parameters\n",
|
| 1245 |
+
" filePath ; file path\n",
|
| 1246 |
+
" filt : row filter\n",
|
| 1247 |
+
" columns : column indexes as int array or coma separated string\n",
|
| 1248 |
+
" delim : delemeter\n",
|
| 1249 |
+
" \"\"\"\n",
|
| 1250 |
+
" columns = strToIntArray(columns, delim)\n",
|
| 1251 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1252 |
+
" for line in fp:\t\n",
|
| 1253 |
+
" line = line[:-1]\n",
|
| 1254 |
+
" if delim is not None:\n",
|
| 1255 |
+
" line = line.split(delim)\n",
|
| 1256 |
+
" if filt(line):\n",
|
| 1257 |
+
" selected = extractList(line, columns)\n",
|
| 1258 |
+
" yield selected\n",
|
| 1259 |
+
"\n",
|
| 1260 |
+
"def fileTypedRecGen(filePath, ftypes, delim = \",\"):\n",
|
| 1261 |
+
" \"\"\"\n",
|
| 1262 |
+
" file typed record generator\n",
|
| 1263 |
+
" Parameters\n",
|
| 1264 |
+
" filePath ; file path\n",
|
| 1265 |
+
" ftypes : list of field types\n",
|
| 1266 |
+
" delim : delemeter\n",
|
| 1267 |
+
" \"\"\"\n",
|
| 1268 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1269 |
+
" for line in fp:\t\n",
|
| 1270 |
+
" line = line[:-1]\n",
|
| 1271 |
+
" line = line.split(delim)\n",
|
| 1272 |
+
" for i in range(0, len(ftypes), 2):\n",
|
| 1273 |
+
" ci = ftypes[i]\n",
|
| 1274 |
+
" dtype = ftypes[i+1]\n",
|
| 1275 |
+
" assertLesser(ci, len(line), \"index out of bound\")\n",
|
| 1276 |
+
" if dtype == \"int\":\n",
|
| 1277 |
+
" line[ci] = int(line[ci])\n",
|
| 1278 |
+
" elif dtype == \"float\":\n",
|
| 1279 |
+
" line[ci] = float(line[ci])\n",
|
| 1280 |
+
" else:\n",
|
| 1281 |
+
" exitWithMsg(\"invalid data type\")\n",
|
| 1282 |
+
" yield line\n",
|
| 1283 |
+
"\n",
|
| 1284 |
+
"def fileMutatedFieldsRecGen(dirPath, mutator, delim=\",\"):\n",
|
| 1285 |
+
" \"\"\"\n",
|
| 1286 |
+
" file record generator with some columns mutated \n",
|
| 1287 |
+
" Parameters\n",
|
| 1288 |
+
" dirPath ; file path\n",
|
| 1289 |
+
" mutator : row field mutator\n",
|
| 1290 |
+
" delim : delemeter\n",
|
| 1291 |
+
" \"\"\"\n",
|
| 1292 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 1293 |
+
" mutated = mutator(rec)\n",
|
| 1294 |
+
" yield mutated\n",
|
| 1295 |
+
"\n",
|
| 1296 |
+
"def tableSelFieldsFilter(tdata, columns):\n",
|
| 1297 |
+
" \"\"\"\n",
|
| 1298 |
+
" gets tabular data for selected columns \n",
|
| 1299 |
+
" Parameters\n",
|
| 1300 |
+
" tdata : tabular data\n",
|
| 1301 |
+
" columns : column indexes\n",
|
| 1302 |
+
" \"\"\"\n",
|
| 1303 |
+
" if areAllFieldsIncluded(tdata[0], columns):\n",
|
| 1304 |
+
" ntdata = tdata\n",
|
| 1305 |
+
" else:\n",
|
| 1306 |
+
" ntdata = list()\n",
|
| 1307 |
+
" for rec in tdata:\n",
|
| 1308 |
+
" #print(rec)\n",
|
| 1309 |
+
" #print(columns)\n",
|
| 1310 |
+
" nrec = extractList(rec, columns)\n",
|
| 1311 |
+
" ntdata.append(nrec)\n",
|
| 1312 |
+
" return ntdata\n",
|
| 1313 |
+
"\n",
|
| 1314 |
+
"\n",
|
| 1315 |
+
"def areAllFieldsIncluded(ldata, columns):\n",
|
| 1316 |
+
" \"\"\"\n",
|
| 1317 |
+
" return True id all indexes are in the columns\n",
|
| 1318 |
+
" Parameters\n",
|
| 1319 |
+
" ldata : list data\n",
|
| 1320 |
+
" columns : column indexes\n",
|
| 1321 |
+
" \"\"\"\n",
|
| 1322 |
+
" return list(range(len(ldata))) == columns\n",
|
| 1323 |
+
"\n",
|
| 1324 |
+
"def asIntList(items):\n",
|
| 1325 |
+
" \"\"\"\n",
|
| 1326 |
+
" returns int list\n",
|
| 1327 |
+
" Parameters\n",
|
| 1328 |
+
" items : list data\n",
|
| 1329 |
+
" \"\"\"\n",
|
| 1330 |
+
" return [int(i) for i in items]\n",
|
| 1331 |
+
"\n",
|
| 1332 |
+
"def asFloatList(items):\n",
|
| 1333 |
+
" \"\"\"\n",
|
| 1334 |
+
" returns float list\n",
|
| 1335 |
+
" Parameters\n",
|
| 1336 |
+
" items : list data\n",
|
| 1337 |
+
" \"\"\"\n",
|
| 1338 |
+
" return [float(i) for i in items]\n",
|
| 1339 |
+
"\n",
|
| 1340 |
+
"def pastTime(interval, unit):\n",
|
| 1341 |
+
" \"\"\"\n",
|
| 1342 |
+
" current and past time\n",
|
| 1343 |
+
" Parameters\n",
|
| 1344 |
+
" interval : time interval\n",
|
| 1345 |
+
" unit: time unit\n",
|
| 1346 |
+
" \"\"\"\n",
|
| 1347 |
+
" curTime = int(time.time())\n",
|
| 1348 |
+
" if unit == \"d\":\n",
|
| 1349 |
+
" pastTime = curTime - interval * secInDay\n",
|
| 1350 |
+
" elif unit == \"h\":\n",
|
| 1351 |
+
" pastTime = curTime - interval * secInHour\n",
|
| 1352 |
+
" elif unit == \"m\":\n",
|
| 1353 |
+
" pastTime = curTime - interval * secInMinute\n",
|
| 1354 |
+
" else:\n",
|
| 1355 |
+
" raise ValueError(\"invalid time unit \" + unit)\n",
|
| 1356 |
+
" return (curTime, pastTime)\n",
|
| 1357 |
+
"\n",
|
| 1358 |
+
"def minuteAlign(ts):\n",
|
| 1359 |
+
" \"\"\"\n",
|
| 1360 |
+
" minute aligned time\t\n",
|
| 1361 |
+
" Parameters\n",
|
| 1362 |
+
" ts : time stamp in sec\n",
|
| 1363 |
+
" \"\"\"\n",
|
| 1364 |
+
" return int((ts / secInMinute)) * secInMinute\n",
|
| 1365 |
+
"\n",
|
| 1366 |
+
"def multMinuteAlign(ts, min):\n",
|
| 1367 |
+
" \"\"\"\n",
|
| 1368 |
+
" multi minute aligned time\t\n",
|
| 1369 |
+
" Parameters\n",
|
| 1370 |
+
" ts : time stamp in sec\n",
|
| 1371 |
+
" min : minute value\n",
|
| 1372 |
+
" \"\"\"\n",
|
| 1373 |
+
" intv = secInMinute * min\n",
|
| 1374 |
+
" return int((ts / intv)) * intv\n",
|
| 1375 |
+
"\n",
|
| 1376 |
+
"def hourAlign(ts):\n",
|
| 1377 |
+
" \"\"\"\n",
|
| 1378 |
+
" hour aligned time\n",
|
| 1379 |
+
" Parameters\n",
|
| 1380 |
+
" ts : time stamp in sec\n",
|
| 1381 |
+
" \"\"\"\n",
|
| 1382 |
+
" return int((ts / secInHour)) * secInHour\n",
|
| 1383 |
+
"\n",
|
| 1384 |
+
"def hourOfDayAlign(ts, hour):\n",
|
| 1385 |
+
" \"\"\"\n",
|
| 1386 |
+
" hour of day aligned time\n",
|
| 1387 |
+
" Parameters\n",
|
| 1388 |
+
" ts : time stamp in sec\n",
|
| 1389 |
+
" hour : hour of day\n",
|
| 1390 |
+
" \"\"\"\n",
|
| 1391 |
+
" day = int(ts / secInDay)\n",
|
| 1392 |
+
" return (24 * day + hour) * secInHour\n",
|
| 1393 |
+
"\n",
|
| 1394 |
+
"def dayAlign(ts):\n",
|
| 1395 |
+
" \"\"\"\n",
|
| 1396 |
+
" day aligned time\n",
|
| 1397 |
+
" Parameters\n",
|
| 1398 |
+
" ts : time stamp in sec\n",
|
| 1399 |
+
" \"\"\"\n",
|
| 1400 |
+
" return int(ts / secInDay) * secInDay\n",
|
| 1401 |
+
"\n",
|
| 1402 |
+
"def timeAlign(ts, unit):\n",
|
| 1403 |
+
" \"\"\"\n",
|
| 1404 |
+
" boundary alignment of time\n",
|
| 1405 |
+
" Parameters\n",
|
| 1406 |
+
" ts : time stamp in sec\n",
|
| 1407 |
+
" unit : unit of time\n",
|
| 1408 |
+
" \"\"\"\n",
|
| 1409 |
+
" alignedTs = 0\n",
|
| 1410 |
+
" if unit == \"s\":\n",
|
| 1411 |
+
" alignedTs = ts\n",
|
| 1412 |
+
" elif unit == \"m\":\n",
|
| 1413 |
+
" alignedTs = minuteAlign(ts)\n",
|
| 1414 |
+
" elif unit == \"h\":\n",
|
| 1415 |
+
" alignedTs = hourAlign(ts)\n",
|
| 1416 |
+
" elif unit == \"d\":\n",
|
| 1417 |
+
" alignedTs = dayAlign(ts)\n",
|
| 1418 |
+
" else:\n",
|
| 1419 |
+
" raise ValueError(\"invalid time unit\")\n",
|
| 1420 |
+
" return \talignedTs\n",
|
| 1421 |
+
"\n",
|
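A small sketch of how the time alignment helpers above might be used; it assumes the secInMinute, secInHour and secInDay constants referenced by these functions are defined earlier in this module:

    import time
    now = int(time.time())
    hourStart = timeAlign(now, "h")   # snap to the start of the current hour
    dayStart = timeAlign(now, "d")    # snap to the start of the current day
    cur, past = pastTime(6, "h")      # current time and the time 6 hours earlier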
| 1422 |
+
"def monthOfYear(ts):\n",
|
| 1423 |
+
" \"\"\"\n",
|
| 1424 |
+
" month of year\n",
|
| 1425 |
+
" Parameters\n",
|
| 1426 |
+
" ts : time stamp in sec\n",
|
| 1427 |
+
" \"\"\"\n",
|
| 1428 |
+
" rem = ts % secInYear\n",
|
| 1429 |
+
" dow = int(rem / secInMonth)\n",
|
| 1430 |
+
" return dow\n",
|
| 1431 |
+
"\n",
|
| 1432 |
+
"def dayOfWeek(ts):\n",
|
| 1433 |
+
" \"\"\"\n",
|
| 1434 |
+
" day of week\n",
|
| 1435 |
+
" Parameters\n",
|
| 1436 |
+
" ts : time stamp in sec\n",
|
| 1437 |
+
" \"\"\"\n",
|
| 1438 |
+
" rem = ts % secInWeek\n",
|
| 1439 |
+
" dow = int(rem / secInDay)\n",
|
| 1440 |
+
" return dow\n",
|
| 1441 |
+
"\n",
|
| 1442 |
+
"def hourOfDay(ts):\n",
|
| 1443 |
+
" \"\"\"\n",
|
| 1444 |
+
" hour of day\n",
|
| 1445 |
+
" Parameters\n",
|
| 1446 |
+
" ts : time stamp in sec\n",
|
| 1447 |
+
" \"\"\"\n",
|
| 1448 |
+
" rem = ts % secInDay\n",
|
| 1449 |
+
" hod = int(rem / secInHour)\n",
|
| 1450 |
+
" return hod\n",
|
| 1451 |
+
"\n",
|
| 1452 |
+
"def processCmdLineArgs(expectedTypes, usage):\n",
|
| 1453 |
+
" \"\"\"\n",
|
| 1454 |
+
" process command line args and returns args as typed values\n",
|
| 1455 |
+
" Parameters\n",
|
| 1456 |
+
" expectedTypes : expected data types of arguments\n",
|
| 1457 |
+
" usage : usage message string\n",
|
| 1458 |
+
" \"\"\"\n",
|
| 1459 |
+
" args = []\n",
|
| 1460 |
+
" numComLineArgs = len(sys.argv)\n",
|
| 1461 |
+
" numExpected = len(expectedTypes)\n",
|
| 1462 |
+
" if (numComLineArgs - 1 == len(expectedTypes)):\n",
|
| 1463 |
+
" try:\n",
|
| 1464 |
+
" for i in range(0, numExpected):\n",
|
| 1465 |
+
" if (expectedTypes[i] == typeInt):\n",
|
| 1466 |
+
" args.append(int(sys.argv[i+1]))\n",
|
| 1467 |
+
" elif (expectedTypes[i] == typeFloat):\n",
|
| 1468 |
+
" args.append(float(sys.argv[i+1]))\n",
|
| 1469 |
+
" elif (expectedTypes[i] == typeString):\n",
|
| 1470 |
+
" args.append(sys.argv[i+1])\n",
|
| 1471 |
+
" except ValueError:\n",
|
| 1472 |
+
" print (\"expected number of command line arguments found but there is type mis match\")\n",
|
| 1473 |
+
" sys.exit(1)\n",
|
| 1474 |
+
" else:\n",
|
| 1475 |
+
" print (\"expected number of command line arguments not found\")\n",
|
| 1476 |
+
" print (usage)\n",
|
| 1477 |
+
" sys.exit(1)\n",
|
| 1478 |
+
" return args\n",
|
| 1479 |
+
"\n",
|
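For reference, a hypothetical invocation of processCmdLineArgs; the script name and arguments are invented, and the typeString, typeInt and typeFloat constants it compares against are assumed to be defined earlier in this module:

    # python myscript.py <inputFile> <numIter> <learnRate>
    inputFile, numIter, learnRate = processCmdLineArgs(
        [typeString, typeInt, typeFloat],
        "usage: myscript.py <inputFile> <numIter> <learnRate>")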
| 1480 |
+
"def mutateString(val, numMutate, ctype):\n",
|
| 1481 |
+
" \"\"\"\n",
|
| 1482 |
+
" mutate string multiple times\n",
|
| 1483 |
+
" Parameters\n",
|
| 1484 |
+
" val : string value\n",
|
| 1485 |
+
" numMutate : num of mutations\n",
|
| 1486 |
+
" ctype : type of character to mutate with\n",
|
| 1487 |
+
" \"\"\"\n",
|
| 1488 |
+
" mutations = set()\n",
|
| 1489 |
+
" count = 0\n",
|
| 1490 |
+
" while count < numMutate:\n",
|
| 1491 |
+
" j = randint(0, len(val)-1)\n",
|
| 1492 |
+
" if j not in mutations:\n",
|
| 1493 |
+
" if ctype == \"alpha\":\n",
|
| 1494 |
+
" ch = selectRandomFromList(alphaTokens)\n",
|
| 1495 |
+
" elif ctype == \"num\":\n",
|
| 1496 |
+
" ch = selectRandomFromList(numTokens)\n",
|
| 1497 |
+
" elif ctype == \"any\":\n",
|
| 1498 |
+
" ch = selectRandomFromList(tokens)\n",
|
| 1499 |
+
" val = val[:j] + ch + val[j+1:]\n",
|
| 1500 |
+
" mutations.add(j)\n",
|
| 1501 |
+
" count += 1\n",
|
| 1502 |
+
" return val\n",
|
| 1503 |
+
"\n",
|
| 1504 |
+
"def mutateList(values, numMutate, vmin, vmax):\n",
|
| 1505 |
+
" \"\"\"\n",
|
| 1506 |
+
" mutate list multiple times\n",
|
| 1507 |
+
" Parameters\n",
|
| 1508 |
+
" values : list value\n",
|
| 1509 |
+
" numMutate : num of mutations\n",
|
| 1510 |
+
" vmin : minimum of value range\n",
|
| 1511 |
+
" vmax : maximum of value range\n",
|
| 1512 |
+
" \"\"\"\n",
|
| 1513 |
+
" mutations = set()\n",
|
| 1514 |
+
" count = 0\n",
|
| 1515 |
+
" while count < numMutate:\n",
|
| 1516 |
+
" j = randint(0, len(values)-1)\n",
|
| 1517 |
+
" if j not in mutations:\n",
|
| 1518 |
+
" values[j] = np.random.uniform(vmin, vmax)\n",
|
| 1519 |
+
" count += 1\n",
|
| 1520 |
+
" return values\n",
|
| 1521 |
+
"\n",
|
| 1522 |
+
"\n",
|
| 1523 |
+
"def swap(values, first, second):\n",
|
| 1524 |
+
" \"\"\"\n",
|
| 1525 |
+
" swap two elements\n",
|
| 1526 |
+
" Parameters\n",
|
| 1527 |
+
" values : list value\n",
|
| 1528 |
+
" first : first swap position\n",
|
| 1529 |
+
" second : second swap position\n",
|
| 1530 |
+
" \"\"\"\n",
|
| 1531 |
+
" t = values[first]\n",
|
| 1532 |
+
" values[first] = values[second]\n",
|
| 1533 |
+
" values[second] = t\n",
|
| 1534 |
+
"\n",
|
| 1535 |
+
"def swapBetweenLists(values1, values2):\n",
|
| 1536 |
+
" \"\"\"\n",
|
| 1537 |
+
" swap two elements between 2 lists\n",
|
| 1538 |
+
" Parameters\n",
|
| 1539 |
+
" values1 : first list of values\n",
|
| 1540 |
+
" values2 : second list of values\n",
|
| 1541 |
+
" \"\"\"\n",
|
| 1542 |
+
" p1 = randint(0, len(values1)-1)\n",
|
| 1543 |
+
" p2 = randint(0, len(values2)-1)\n",
|
| 1544 |
+
" tmp = values1[p1]\t\n",
|
| 1545 |
+
" values1[p1] = values2[p2]\n",
|
| 1546 |
+
" values2[p2] = tmp\n",
|
| 1547 |
+
"\n",
|
| 1548 |
+
"def safeAppend(values, value):\n",
|
| 1549 |
+
" \"\"\"\n",
|
| 1550 |
+
" append only if not None\n",
|
| 1551 |
+
" Parameters\n",
|
| 1552 |
+
" values : list value\n",
|
| 1553 |
+
" value : value to append\n",
|
| 1554 |
+
" \"\"\"\n",
|
| 1555 |
+
" if value is not None:\n",
|
| 1556 |
+
" values.append(value)\n",
|
| 1557 |
+
"\n",
|
| 1558 |
+
"def getAllIndex(ldata, fldata):\n",
|
| 1559 |
+
" \"\"\"\n",
|
| 1560 |
+
" get ALL indexes of list elements\n",
|
| 1561 |
+
" Parameters\n",
|
| 1562 |
+
" ldata : list data to find index in\n",
|
| 1563 |
+
" fldata : list data for values for index look up\n",
|
| 1564 |
+
" \"\"\"\n",
|
| 1565 |
+
" return list(map(lambda e : fldata.index(e), ldata))\n",
|
| 1566 |
+
"\n",
|
| 1567 |
+
"def findIntersection(lOne, lTwo):\n",
|
| 1568 |
+
" \"\"\"\n",
|
| 1569 |
+
" find intersection elements between 2 lists\n",
|
| 1570 |
+
" Parameters\n",
|
| 1571 |
+
" lOne : first list of data\n",
|
| 1572 |
+
" lTwo : second list of data\n",
|
| 1573 |
+
" \"\"\"\n",
|
| 1574 |
+
" sOne = set(lOne)\n",
|
| 1575 |
+
" sTwo = set(lTwo)\n",
|
| 1576 |
+
" sInt = sOne.intersection(sTwo)\n",
|
| 1577 |
+
" return list(sInt)\n",
|
| 1578 |
+
"\n",
|
| 1579 |
+
"def isIntvOverlapped(rOne, rTwo):\n",
|
| 1580 |
+
" \"\"\"\n",
|
| 1581 |
+
" checks overlap between 2 intervals\n",
|
| 1582 |
+
" Parameters\n",
|
| 1583 |
+
" rOne : first interval boundaries\n",
|
| 1584 |
+
" rTwo : second interval boundaries\n",
|
| 1585 |
+
" \"\"\"\n",
|
| 1586 |
+
" clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1] \n",
|
| 1587 |
+
" return not clear\n",
|
| 1588 |
+
"\n",
|
| 1589 |
+
"def isIntvLess(rOne, rTwo):\n",
|
| 1590 |
+
" \"\"\"\n",
|
| 1591 |
+
" checks if first iterval is less than second\n",
|
| 1592 |
+
" Parameters\n",
|
| 1593 |
+
" rOne : first interval boundaries\n",
|
| 1594 |
+
" rTwo : second interval boundaries\n",
|
| 1595 |
+
" \"\"\"\n",
|
| 1596 |
+
" less = rOne[1] <= rTwo[0] \n",
|
| 1597 |
+
" return less\n",
|
| 1598 |
+
"\n",
|
| 1599 |
+
"def findRank(e, values):\n",
|
| 1600 |
+
" \"\"\"\n",
|
| 1601 |
+
" find rank of value in a list\n",
|
| 1602 |
+
" Parameters\n",
|
| 1603 |
+
" e : value to compare with\n",
|
| 1604 |
+
" values : list data\n",
|
| 1605 |
+
" \"\"\"\n",
|
| 1606 |
+
" count = 1\n",
|
| 1607 |
+
" for ve in values:\n",
|
| 1608 |
+
" if ve < e:\n",
|
| 1609 |
+
" count += 1\n",
|
| 1610 |
+
" return count\n",
|
| 1611 |
+
"\n",
|
| 1612 |
+
"def findRanks(toBeRanked, values):\n",
|
| 1613 |
+
" \"\"\"\n",
|
| 1614 |
+
" find ranks of values in one list in another list\n",
|
| 1615 |
+
" Parameters\n",
|
| 1616 |
+
" toBeRanked : list of values for which ranks are found\n",
|
| 1617 |
+
" values : list in which rank is found : \n",
|
| 1618 |
+
" \"\"\"\n",
|
| 1619 |
+
" return list(map(lambda e: findRank(e, values), toBeRanked))\n",
|
| 1620 |
+
"\n",
|
| 1621 |
+
"def formatFloat(prec, value, label = None):\n",
|
| 1622 |
+
" \"\"\"\n",
|
| 1623 |
+
" formats a float with optional label\n",
|
| 1624 |
+
" Parameters\n",
|
| 1625 |
+
" prec : precision\n",
|
| 1626 |
+
" value : data value\n",
|
| 1627 |
+
" label : label for data\n",
|
| 1628 |
+
" \"\"\"\n",
|
| 1629 |
+
" st = (label + \" \") if label else \"\"\n",
|
| 1630 |
+
" formatter = \"{:.\" + str(prec) + \"f}\" \n",
|
| 1631 |
+
" return st + formatter.format(value)\n",
|
| 1632 |
+
"\n",
|
| 1633 |
+
"def formatAny(value, label = None):\n",
|
| 1634 |
+
" \"\"\"\n",
|
| 1635 |
+
" formats any obkect with optional label\n",
|
| 1636 |
+
" Parameters\n",
|
| 1637 |
+
" value : data value\n",
|
| 1638 |
+
" label : label for data\n",
|
| 1639 |
+
" \"\"\"\n",
|
| 1640 |
+
" st = (label + \" \") if label else \"\"\n",
|
| 1641 |
+
" return st + str(value)\n",
|
| 1642 |
+
"\n",
|
| 1643 |
+
"def printList(values):\n",
|
| 1644 |
+
" \"\"\"\n",
|
| 1645 |
+
" pretty print list\n",
|
| 1646 |
+
" Parameters\n",
|
| 1647 |
+
" values : list of values\n",
|
| 1648 |
+
" \"\"\"\n",
|
| 1649 |
+
" for v in values:\n",
|
| 1650 |
+
" print(v)\n",
|
| 1651 |
+
"\n",
|
| 1652 |
+
"def printMap(values, klab, vlab, precision, offset=16):\n",
|
| 1653 |
+
" \"\"\"\n",
|
| 1654 |
+
" pretty print hash map\n",
|
| 1655 |
+
" Parameters\n",
|
| 1656 |
+
" values : dictionary of values\n",
|
| 1657 |
+
" klab : label for key\n",
|
| 1658 |
+
" vlab : label for value\n",
|
| 1659 |
+
" precision : precision\n",
|
| 1660 |
+
" offset : left justify offset\n",
|
| 1661 |
+
" \"\"\"\n",
|
| 1662 |
+
" print(klab.ljust(offset, \" \") + vlab)\n",
|
| 1663 |
+
" for k in values.keys():\n",
|
| 1664 |
+
" v = values[k]\n",
|
| 1665 |
+
" ks = toStr(k, precision).ljust(offset, \" \")\n",
|
| 1666 |
+
" vs = toStr(v, precision)\n",
|
| 1667 |
+
" print(ks + vs)\n",
|
| 1668 |
+
"\n",
|
| 1669 |
+
"def printPairList(values, lab1, lab2, precision, offset=16):\n",
|
| 1670 |
+
" \"\"\"\n",
|
| 1671 |
+
" pretty print list of pairs\n",
|
| 1672 |
+
" Parameters\n",
|
| 1673 |
+
" values : dictionary of values\n",
|
| 1674 |
+
" lab1 : first label\n",
|
| 1675 |
+
" lab2 : second label\n",
|
| 1676 |
+
" precision : precision\n",
|
| 1677 |
+
" offset : left justify offset\n",
|
| 1678 |
+
" \"\"\"\n",
|
| 1679 |
+
" print(lab1.ljust(offset, \" \") + lab2)\n",
|
| 1680 |
+
" for (v1, v2) in values:\n",
|
| 1681 |
+
" sv1 = toStr(v1, precision).ljust(offset, \" \")\n",
|
| 1682 |
+
" sv2 = toStr(v2, precision)\n",
|
| 1683 |
+
" print(sv1 + sv2)\n",
|
| 1684 |
+
"\n",
|
| 1685 |
+
"def createMap(*values):\n",
|
| 1686 |
+
" \"\"\"\n",
|
| 1687 |
+
" create disctionary with results\n",
|
| 1688 |
+
" Parameters\n",
|
| 1689 |
+
" values : sequence of key value pairs\n",
|
| 1690 |
+
" \"\"\"\n",
|
| 1691 |
+
" result = dict()\n",
|
| 1692 |
+
" for i in range(0, len(values), 2):\n",
|
| 1693 |
+
" result[values[i]] = values[i+1]\n",
|
| 1694 |
+
" return result\n",
|
| 1695 |
+
"\n",
|
| 1696 |
+
"def getColMinMax(table, col):\n",
|
| 1697 |
+
" \"\"\"\n",
|
| 1698 |
+
" return min, max values of a column\n",
|
| 1699 |
+
" Parameters\n",
|
| 1700 |
+
" table : tabular data\n",
|
| 1701 |
+
" col : column index\n",
|
| 1702 |
+
" \"\"\"\n",
|
| 1703 |
+
" vmin = None\n",
|
| 1704 |
+
" vmax = None\n",
|
| 1705 |
+
" for rec in table:\n",
|
| 1706 |
+
" value = rec[col]\n",
|
| 1707 |
+
" if vmin is None:\n",
|
| 1708 |
+
" vmin = value\n",
|
| 1709 |
+
" vmax = value\n",
|
| 1710 |
+
" else:\n",
|
| 1711 |
+
" if value < vmin:\n",
|
| 1712 |
+
" vmin = value\n",
|
| 1713 |
+
" elif value > vmax:\n",
|
| 1714 |
+
" vmax = value\n",
|
| 1715 |
+
" return (vmin, vmax, vmax - vmin)\n",
|
| 1716 |
+
"\n",
|
| 1717 |
+
"def createLogger(name, logFilePath, logLevName):\n",
|
| 1718 |
+
" \"\"\"\n",
|
| 1719 |
+
" creates logger\n",
|
| 1720 |
+
" Parameters\n",
|
| 1721 |
+
" name : logger name\n",
|
| 1722 |
+
" logFilePath : log file path\n",
|
| 1723 |
+
" logLevName : log level\n",
|
| 1724 |
+
" \"\"\"\n",
|
| 1725 |
+
" logger = logging.getLogger(name)\n",
|
| 1726 |
+
" fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)\n",
|
| 1727 |
+
" logLev = logLevName.lower()\n",
|
| 1728 |
+
" if logLev == \"debug\":\n",
|
| 1729 |
+
" logLevel = logging.DEBUG\n",
|
| 1730 |
+
" elif logLev == \"info\":\n",
|
| 1731 |
+
" logLevel = logging.INFO\n",
|
| 1732 |
+
" elif logLev == \"warning\":\n",
|
| 1733 |
+
" logLevel = logging.WARNING\n",
|
| 1734 |
+
" elif logLev == \"error\":\n",
|
| 1735 |
+
" logLevel = logging.ERROR\n",
|
| 1736 |
+
" elif logLev == \"critical\":\n",
|
| 1737 |
+
" logLevel = logging.CRITICAL\n",
|
| 1738 |
+
" else:\n",
|
| 1739 |
+
" raise ValueError(\"invalid log level name \" + logLevelName)\n",
|
| 1740 |
+
" fHandler.setLevel(logLevel)\n",
|
| 1741 |
+
" fFormat = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n",
|
| 1742 |
+
" fHandler.setFormatter(fFormat)\n",
|
| 1743 |
+
" logger.addHandler(fHandler)\n",
|
| 1744 |
+
" logger.setLevel(logLevel)\n",
|
| 1745 |
+
" return logger\n",
|
| 1746 |
+
"\n",
|
| 1747 |
+
"@contextmanager\n",
|
| 1748 |
+
"def suppressStdout():\n",
|
| 1749 |
+
" \"\"\"\n",
|
| 1750 |
+
" suppress stdout\n",
|
| 1751 |
+
" Parameters\n",
|
| 1752 |
+
" \"\"\"\n",
|
| 1753 |
+
" with open(os.devnull, \"w\") as devnull:\n",
|
| 1754 |
+
" oldStdout = sys.stdout\n",
|
| 1755 |
+
" sys.stdout = devnull\n",
|
| 1756 |
+
" try: \n",
|
| 1757 |
+
" yield\n",
|
| 1758 |
+
" finally:\n",
|
| 1759 |
+
" sys.stdout = oldStdout\n",
|
| 1760 |
+
"\n",
|
| 1761 |
+
"def exitWithMsg(msg):\n",
|
| 1762 |
+
" \"\"\"\n",
|
| 1763 |
+
" print message and exit\n",
|
| 1764 |
+
" Parameters\n",
|
| 1765 |
+
" msg : message\n",
|
| 1766 |
+
" \"\"\"\n",
|
| 1767 |
+
" print(msg + \" -- quitting\")\n",
|
| 1768 |
+
" sys.exit(0)\n",
|
| 1769 |
+
"\n",
|
| 1770 |
+
"def drawLine(data, yscale=None):\n",
|
| 1771 |
+
" \"\"\"\n",
|
| 1772 |
+
" line plot\n",
|
| 1773 |
+
" Parameters\n",
|
| 1774 |
+
" data : list data\n",
|
| 1775 |
+
" yscale : y axis scale\n",
|
| 1776 |
+
" \"\"\"\n",
|
| 1777 |
+
" plt.plot(data)\n",
|
| 1778 |
+
" if yscale:\n",
|
| 1779 |
+
" step = int(yscale / 10)\n",
|
| 1780 |
+
" step = int(step / 10) * 10\n",
|
| 1781 |
+
" plt.yticks(range(0, yscale, step))\n",
|
| 1782 |
+
" plt.show()\n",
|
| 1783 |
+
"\n",
|
| 1784 |
+
"def drawPlot(x, y, xlabel, ylabel):\n",
|
| 1785 |
+
" \"\"\"\n",
|
| 1786 |
+
" line plot\n",
|
| 1787 |
+
" Parameters\n",
|
| 1788 |
+
" x : x values\n",
|
| 1789 |
+
" y : y values\n",
|
| 1790 |
+
" xlabel : x axis label\n",
|
| 1791 |
+
" ylabel : y axis label\n",
|
| 1792 |
+
" \"\"\"\n",
|
| 1793 |
+
" plt.plot(x,y)\n",
|
| 1794 |
+
" plt.xlabel(xlabel)\n",
|
| 1795 |
+
" plt.ylabel(ylabel)\n",
|
| 1796 |
+
" plt.show()\n",
|
| 1797 |
+
"\n",
|
| 1798 |
+
"def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):\n",
|
| 1799 |
+
" \"\"\"\n",
|
| 1800 |
+
" line plot of 2 lines\n",
|
| 1801 |
+
" Parameters\n",
|
| 1802 |
+
" x : x values\n",
|
| 1803 |
+
" y1 : first y values\n",
|
| 1804 |
+
" y2 : second y values\n",
|
| 1805 |
+
" xlabel : x labbel\n",
|
| 1806 |
+
" ylabel : y label\n",
|
| 1807 |
+
" y1label : first plot label\n",
|
| 1808 |
+
" y2label : second plot label\n",
|
| 1809 |
+
" \"\"\"\n",
|
| 1810 |
+
" plt.plot(x, y1, label = y1label)\n",
|
| 1811 |
+
" plt.plot(x, y2, label = y2label)\n",
|
| 1812 |
+
" plt.xlabel(xlabel)\n",
|
| 1813 |
+
" plt.ylabel(ylabel)\n",
|
| 1814 |
+
" plt.legend()\n",
|
| 1815 |
+
" plt.show()\n",
|
| 1816 |
+
"\n",
|
| 1817 |
+
"def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):\n",
|
| 1818 |
+
" \"\"\"\n",
|
| 1819 |
+
" draw histogram\n",
|
| 1820 |
+
" Parameters\n",
|
| 1821 |
+
" ldata : list data\n",
|
| 1822 |
+
" myTitle : title\n",
|
| 1823 |
+
" myXlabel : x label\n",
|
| 1824 |
+
" myYlabel : y label \n",
|
| 1825 |
+
" nbins : num of bins\n",
|
| 1826 |
+
" \"\"\"\n",
|
| 1827 |
+
" plt.hist(ldata, bins=nbins, density=True)\n",
|
| 1828 |
+
" plt.title(myTitle)\n",
|
| 1829 |
+
" plt.xlabel(myXlabel)\n",
|
| 1830 |
+
" plt.ylabel(myYlabel)\n",
|
| 1831 |
+
" plt.show()\t\n",
|
| 1832 |
+
"\n",
|
| 1833 |
+
"def saveObject(obj, filePath):\n",
|
| 1834 |
+
" \"\"\"\n",
|
| 1835 |
+
" saves an object\n",
|
| 1836 |
+
" Parameters\n",
|
| 1837 |
+
" obj : object\n",
|
| 1838 |
+
" filePath : file path for saved object\n",
|
| 1839 |
+
" \"\"\"\n",
|
| 1840 |
+
" with open(filePath, \"wb\") as outfile:\n",
|
| 1841 |
+
" pickle.dump(obj,outfile)\n",
|
| 1842 |
+
"\n",
|
| 1843 |
+
"def restoreObject(filePath):\n",
|
| 1844 |
+
" \"\"\"\n",
|
| 1845 |
+
" restores an object\n",
|
| 1846 |
+
" Parameters\n",
|
| 1847 |
+
" filePath : file path to restore object from\n",
|
| 1848 |
+
" \"\"\"\n",
|
| 1849 |
+
" with open(filePath, \"rb\") as infile:\n",
|
| 1850 |
+
" obj = pickle.load(infile)\n",
|
| 1851 |
+
" return obj\n",
|
| 1852 |
+
"\n",
|
| 1853 |
+
"def isNumeric(data):\n",
|
| 1854 |
+
" \"\"\"\n",
|
| 1855 |
+
" true if all elements int or float\n",
|
| 1856 |
+
" Parameters\n",
|
| 1857 |
+
" data : numeric data list\n",
|
| 1858 |
+
" \"\"\"\n",
|
| 1859 |
+
" if type(data) == list or type(data) == np.ndarray:\n",
|
| 1860 |
+
" col = pd.Series(data)\n",
|
| 1861 |
+
" else:\n",
|
| 1862 |
+
" col = data\n",
|
| 1863 |
+
" return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64\n",
|
| 1864 |
+
"\n",
|
| 1865 |
+
"def isInteger(data):\n",
|
| 1866 |
+
" \"\"\"\n",
|
| 1867 |
+
" true if all elements int \n",
|
| 1868 |
+
" Parameters\n",
|
| 1869 |
+
" data : numeric data list\n",
|
| 1870 |
+
" \"\"\"\n",
|
| 1871 |
+
" if type(data) == list or type(data) == np.ndarray:\n",
|
| 1872 |
+
" col = pd.Series(data)\n",
|
| 1873 |
+
" else:\n",
|
| 1874 |
+
" col = data\n",
|
| 1875 |
+
" return col.dtype == np.int32 or col.dtype == np.int64\n",
|
| 1876 |
+
"\n",
|
| 1877 |
+
"def isFloat(data):\n",
|
| 1878 |
+
" \"\"\"\n",
|
| 1879 |
+
" true if all elements float\n",
|
| 1880 |
+
" Parameters\n",
|
| 1881 |
+
" data : numeric data list\n",
|
| 1882 |
+
" \"\"\"\n",
|
| 1883 |
+
" if type(data) == list or type(data) == np.ndarray:\n",
|
| 1884 |
+
" col = pd.Series(data)\n",
|
| 1885 |
+
" else:\n",
|
| 1886 |
+
" col = data\n",
|
| 1887 |
+
" return col.dtype == np.float32 or col.dtype == np.float64\n",
|
| 1888 |
+
"\n",
|
| 1889 |
+
"def isBinary(data):\n",
|
| 1890 |
+
" \"\"\"\n",
|
| 1891 |
+
" true if all elements either 0 or 1\n",
|
| 1892 |
+
" Parameters\n",
|
| 1893 |
+
" data : binary data\n",
|
| 1894 |
+
" \"\"\"\n",
|
| 1895 |
+
" re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)\n",
|
| 1896 |
+
" return (re is None)\n",
|
| 1897 |
+
"\n",
|
| 1898 |
+
"def isCategorical(data):\n",
|
| 1899 |
+
" \"\"\"\n",
|
| 1900 |
+
" true if all elements int or string\n",
|
| 1901 |
+
" Parameters\n",
|
| 1902 |
+
" data : data value\n",
|
| 1903 |
+
" \"\"\"\n",
|
| 1904 |
+
" re = next((d for d in data if not (type(d) == int or type(d) == str)), None)\n",
|
| 1905 |
+
" return (re is None)\n",
|
| 1906 |
+
"\n",
|
| 1907 |
+
"def assertEqual(value, veq, msg):\n",
|
| 1908 |
+
" \"\"\"\n",
|
| 1909 |
+
" assert equal to\n",
|
| 1910 |
+
" Parameters\n",
|
| 1911 |
+
" value : value\n",
|
| 1912 |
+
" veq : value to be equated with\n",
|
| 1913 |
+
" msg : error msg\n",
|
| 1914 |
+
" \"\"\"\n",
|
| 1915 |
+
" assert value == veq , msg\n",
|
| 1916 |
+
"\n",
|
| 1917 |
+
"def assertGreater(value, vmin, msg):\n",
|
| 1918 |
+
" \"\"\"\n",
|
| 1919 |
+
" assert greater than \n",
|
| 1920 |
+
" Parameters\n",
|
| 1921 |
+
" value : value\n",
|
| 1922 |
+
" vmin : minimum value\n",
|
| 1923 |
+
" msg : error msg\n",
|
| 1924 |
+
" \"\"\"\n",
|
| 1925 |
+
" assert value > vmin , msg\n",
|
| 1926 |
+
"\n",
|
| 1927 |
+
"def assertGreaterEqual(value, vmin, msg):\n",
|
| 1928 |
+
" \"\"\"\n",
|
| 1929 |
+
" assert greater than \n",
|
| 1930 |
+
" Parameters\n",
|
| 1931 |
+
" value : value\n",
|
| 1932 |
+
" vmin : minimum value\n",
|
| 1933 |
+
" msg : error msg\n",
|
| 1934 |
+
" \"\"\"\n",
|
| 1935 |
+
" assert value >= vmin , msg\n",
|
| 1936 |
+
"\n",
|
| 1937 |
+
"def assertLesser(value, vmax, msg):\n",
|
| 1938 |
+
" \"\"\"\n",
|
| 1939 |
+
" assert less than\n",
|
| 1940 |
+
" Parameters\n",
|
| 1941 |
+
" value : value\n",
|
| 1942 |
+
" vmax : maximum value\n",
|
| 1943 |
+
" msg : error msg\n",
|
| 1944 |
+
" \"\"\"\n",
|
| 1945 |
+
" assert value < vmax , msg\n",
|
| 1946 |
+
"\n",
|
| 1947 |
+
"def assertLesserEqual(value, vmax, msg):\n",
|
| 1948 |
+
" \"\"\"\n",
|
| 1949 |
+
" assert less than\n",
|
| 1950 |
+
" Parameters\n",
|
| 1951 |
+
" value : value\n",
|
| 1952 |
+
" vmax : maximum value\n",
|
| 1953 |
+
" msg : error msg\n",
|
| 1954 |
+
" \"\"\"\n",
|
| 1955 |
+
" assert value <= vmax , msg\n",
|
| 1956 |
+
"\n",
|
| 1957 |
+
"def assertWithinRange(value, vmin, vmax, msg):\n",
|
| 1958 |
+
" \"\"\"\n",
|
| 1959 |
+
" assert within range\n",
|
| 1960 |
+
" Parameters\n",
|
| 1961 |
+
" value : value\n",
|
| 1962 |
+
" vmin : minimum value\n",
|
| 1963 |
+
" vmax : maximum value\n",
|
| 1964 |
+
" msg : error msg\n",
|
| 1965 |
+
" \"\"\"\n",
|
| 1966 |
+
" assert value >= vmin and value <= vmax, msg\n",
|
| 1967 |
+
"\n",
|
| 1968 |
+
"def assertInList(value, values, msg):\n",
|
| 1969 |
+
" \"\"\"\n",
|
| 1970 |
+
" assert contains in a list\n",
|
| 1971 |
+
" Parameters\n",
|
| 1972 |
+
" value ; balue to check for inclusion\n",
|
| 1973 |
+
" values : list data\n",
|
| 1974 |
+
" msg : error msg\n",
|
| 1975 |
+
" \"\"\"\n",
|
| 1976 |
+
" assert value in values, msg\n",
|
| 1977 |
+
"\n",
|
| 1978 |
+
"def maxListDist(l1, l2):\n",
|
| 1979 |
+
" \"\"\"\n",
|
| 1980 |
+
" maximum list element difference between 2 lists\n",
|
| 1981 |
+
" Parameters\n",
|
| 1982 |
+
" l1 : first list data\n",
|
| 1983 |
+
" l2 : second list data\n",
|
| 1984 |
+
" \"\"\"\n",
|
| 1985 |
+
" dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))\t\n",
|
| 1986 |
+
" return dist\n",
|
| 1987 |
+
"\n",
|
| 1988 |
+
"def fileLineCount(fPath):\n",
|
| 1989 |
+
" \"\"\" \n",
|
| 1990 |
+
" number of lines ina file \n",
|
| 1991 |
+
" Parameters\n",
|
| 1992 |
+
" fPath : file path\n",
|
| 1993 |
+
" \"\"\"\n",
|
| 1994 |
+
" with open(fPath) as f:\n",
|
| 1995 |
+
" for i, li in enumerate(f):\n",
|
| 1996 |
+
" pass\n",
|
| 1997 |
+
" return (i + 1)\n",
|
| 1998 |
+
"\n",
|
| 1999 |
+
"def getAlphaNumCharCount(sdata):\n",
|
| 2000 |
+
" \"\"\" \n",
|
| 2001 |
+
" number of alphabetic and numeric charcters in a string \n",
|
| 2002 |
+
" Parameters\n",
|
| 2003 |
+
" sdata : string data\n",
|
| 2004 |
+
" \"\"\"\n",
|
| 2005 |
+
" acount = 0\n",
|
| 2006 |
+
" ncount = 0\n",
|
| 2007 |
+
" scount = 0\n",
|
| 2008 |
+
" ocount = 0\n",
|
| 2009 |
+
" assertEqual(type(sdata), str, \"input must be string\")\n",
|
| 2010 |
+
" for c in sdata:\n",
|
| 2011 |
+
" if c.isnumeric():\n",
|
| 2012 |
+
" ncount += 1\n",
|
| 2013 |
+
" elif c.isalpha():\n",
|
| 2014 |
+
" acount += 1\n",
|
| 2015 |
+
" elif c.isspace():\n",
|
| 2016 |
+
" scount += 1\n",
|
| 2017 |
+
" else:\n",
|
| 2018 |
+
" ocount += 1\n",
|
| 2019 |
+
" r = (acount, ncount, ocount)\n",
|
| 2020 |
+
" return r\n",
|
| 2021 |
+
"\n",
|
| 2022 |
+
"class StepFunction:\n",
|
| 2023 |
+
" \"\"\"\n",
|
| 2024 |
+
" step function\n",
|
| 2025 |
+
" Parameters\n",
|
| 2026 |
+
" \"\"\"\n",
|
| 2027 |
+
" def __init__(self, *values):\n",
|
| 2028 |
+
" \"\"\"\n",
|
| 2029 |
+
" initilizer\n",
|
| 2030 |
+
"\n",
|
| 2031 |
+
" Parameters\n",
|
| 2032 |
+
" values : list of tuples, wich each tuple containing 2 x values and corresponding y value\n",
|
| 2033 |
+
" \"\"\"\n",
|
| 2034 |
+
" self.points = values\n",
|
| 2035 |
+
"\n",
|
| 2036 |
+
" def find(self, x):\n",
|
| 2037 |
+
" \"\"\"\n",
|
| 2038 |
+
" finds step function value\n",
|
| 2039 |
+
"\n",
|
| 2040 |
+
" Parameters\n",
|
| 2041 |
+
" x : x value\n",
|
| 2042 |
+
" \"\"\"\n",
|
| 2043 |
+
" found = False\n",
|
| 2044 |
+
" y = 0\n",
|
| 2045 |
+
" for p in self.points:\n",
|
| 2046 |
+
" if (x >= p[0] and x < p[1]):\n",
|
| 2047 |
+
" y = p[2]\n",
|
| 2048 |
+
" found = True\n",
|
| 2049 |
+
" break\n",
|
| 2050 |
+
"\n",
|
| 2051 |
+
" if not found:\n",
|
| 2052 |
+
" l = len(self.points)\n",
|
| 2053 |
+
" if (x < self.points[0][0]):\n",
|
| 2054 |
+
" y = self.points[0][2]\n",
|
| 2055 |
+
" elif (x > self.points[l-1][1]):\n",
|
| 2056 |
+
" y = self.points[l-1][2]\n",
|
| 2057 |
+
" return y\n",
|
| 2058 |
+
"\n",
|
| 2059 |
+
"\n",
|
| 2060 |
+
"class DummyVarGenerator:\n",
|
| 2061 |
+
" \"\"\"\n",
|
| 2062 |
+
" dummy variable generator for categorical variable\n",
|
| 2063 |
+
" \"\"\"\n",
|
| 2064 |
+
" def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):\n",
|
| 2065 |
+
" \"\"\"\n",
|
| 2066 |
+
" initilizer\n",
|
| 2067 |
+
"\n",
|
| 2068 |
+
" Parameters\n",
|
| 2069 |
+
" rowSize : row size\n",
|
| 2070 |
+
" catValues : dictionary with field index as key and list of categorical values as value\n",
|
| 2071 |
+
" trueVal : true value, typically \"1\"\n",
|
| 2072 |
+
" falseval : false value , typically \"0\"\n",
|
| 2073 |
+
" delim : field delemeter\n",
|
| 2074 |
+
" \"\"\"\n",
|
| 2075 |
+
" self.rowSize = rowSize\n",
|
| 2076 |
+
" self.catValues = catValues\n",
|
| 2077 |
+
" numCatVar = len(catValues)\n",
|
| 2078 |
+
" colCount = 0\n",
|
| 2079 |
+
" for v in self.catValues.values():\n",
|
| 2080 |
+
" colCount += len(v)\n",
|
| 2081 |
+
" self.newRowSize = rowSize - numCatVar + colCount\n",
|
| 2082 |
+
" #print (\"new row size {}\".format(self.newRowSize))\n",
|
| 2083 |
+
" self.trueVal = trueVal\n",
|
| 2084 |
+
" self.falseVal = falseVal\n",
|
| 2085 |
+
" self.delim = delim\n",
|
| 2086 |
+
"\n",
|
| 2087 |
+
" def processRow(self, row):\n",
|
| 2088 |
+
" \"\"\"\n",
|
| 2089 |
+
" encodes categorical variables, returning as delemeter separate dstring or list\n",
|
| 2090 |
+
"\n",
|
| 2091 |
+
" Parameters\n",
|
| 2092 |
+
" row : row either delemeter separated string or list\n",
|
| 2093 |
+
" \"\"\"\n",
|
| 2094 |
+
" if self.delim is not None:\n",
|
| 2095 |
+
" rowArr = row.split(self.delim)\n",
|
| 2096 |
+
" msg = \"row does not have expected number of columns found \" + str(len(rowArr)) + \" expected \" + str(self.rowSize)\n",
|
| 2097 |
+
" assert len(rowArr) == self.rowSize, msg\n",
|
| 2098 |
+
" else:\n",
|
| 2099 |
+
" rowArr = row\n",
|
| 2100 |
+
"\n",
|
| 2101 |
+
" newRowArr = []\n",
|
| 2102 |
+
" for i in range(len(rowArr)):\n",
|
| 2103 |
+
" curVal = rowArr[i]\n",
|
| 2104 |
+
" if (i in self.catValues):\n",
|
| 2105 |
+
" values = self.catValues[i]\n",
|
| 2106 |
+
" for val in values:\n",
|
| 2107 |
+
" if val == curVal:\n",
|
| 2108 |
+
" newVal = self.trueVal\n",
|
| 2109 |
+
" else:\n",
|
| 2110 |
+
" newVal = self.falseVal\n",
|
| 2111 |
+
" newRowArr.append(newVal)\n",
|
| 2112 |
+
" else:\n",
|
| 2113 |
+
" newRowArr.append(curVal)\n",
|
| 2114 |
+
" assert len(newRowArr) == self.newRowSize, \"invalid new row size \" + str(len(newRowArr)) + \" expected \" + str(self.newRowSize)\n",
|
| 2115 |
+
" encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr\n",
|
| 2116 |
+
" return encRow\n"
|
| 2117 |
+
]
|
| 2118 |
+
}
|
| 2119 |
+
],
|
| 2120 |
+
"metadata": {
|
| 2121 |
+
"kernelspec": {
|
| 2122 |
+
"display_name": "Python 3 (ipykernel)",
|
| 2123 |
+
"language": "python",
|
| 2124 |
+
"name": "python3"
|
| 2125 |
+
},
|
| 2126 |
+
"language_info": {
|
| 2127 |
+
"codemirror_mode": {
|
| 2128 |
+
"name": "ipython",
|
| 2129 |
+
"version": 3
|
| 2130 |
+
},
|
| 2131 |
+
"file_extension": ".py",
|
| 2132 |
+
"mimetype": "text/x-python",
|
| 2133 |
+
"name": "python",
|
| 2134 |
+
"nbconvert_exporter": "python",
|
| 2135 |
+
"pygments_lexer": "ipython3",
|
| 2136 |
+
"version": "3.9.12"
|
| 2137 |
+
}
|
| 2138 |
+
},
|
| 2139 |
+
"nbformat": 4,
|
| 2140 |
+
"nbformat_minor": 5
|
| 2141 |
+
}
|
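Taken together, the helpers in this checkpoint notebook cover file record iteration, simple list and time utilities, and categorical encoding. A minimal sketch of how the record generator and DummyVarGenerator defined above might be combined; the file name, column layout and category values are invented for illustration:

    # stream a 3 column CSV (id, color, amount) and one hot encode the color field
    gen = DummyVarGenerator(3, {1 : ["red", "green", "blue"]}, "1", "0", ",")
    for rec in fileRecGen("sample.csv", None):
        # delim is None, so rec is the raw comma separated line
        print(gen.processRow(rec))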
lib/mlutil.ipynb
ADDED
|
@@ -0,0 +1,1297 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "2d05ce02",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"import numpy as np\n",
|
| 13 |
+
"from sklearn import preprocessing\n",
|
| 14 |
+
"from sklearn import metrics\n",
|
| 15 |
+
"from sklearn.datasets import make_blobs\n",
|
| 16 |
+
"from sklearn.datasets import make_classification\n",
|
| 17 |
+
"import random\n",
|
| 18 |
+
"from math import *\n",
|
| 19 |
+
"from decimal import Decimal\n",
|
| 20 |
+
"import statistics\n",
|
| 21 |
+
"import jprops\n",
|
| 22 |
+
"from Levenshtein import distance as ld\n",
|
| 23 |
+
"from util import *\n",
|
| 24 |
+
"from sampler import *\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"class Configuration:\n",
|
| 27 |
+
" \"\"\"\n",
|
| 28 |
+
" Configuration management. Supports default value, mandatory value and typed value.\n",
|
| 29 |
+
" \"\"\"\n",
|
| 30 |
+
" def __init__(self, configFile, defValues, verbose=False):\n",
|
| 31 |
+
" \"\"\"\n",
|
| 32 |
+
" initializer\n",
|
| 33 |
+
"\n",
|
| 34 |
+
" Parameters\n",
|
| 35 |
+
" configFile : config file path\n",
|
| 36 |
+
" defValues : dictionary of default values\n",
|
| 37 |
+
" verbose : verbosity flag\n",
|
| 38 |
+
" \"\"\"\n",
|
| 39 |
+
" configs = {}\n",
|
| 40 |
+
" with open(configFile) as fp:\n",
|
| 41 |
+
" for key, value in jprops.iter_properties(fp):\n",
|
| 42 |
+
" configs[key] = value\n",
|
| 43 |
+
" self.configs = configs\n",
|
| 44 |
+
" self.defValues = defValues\n",
|
| 45 |
+
" self.verbose = verbose\n",
|
| 46 |
+
"\n",
|
| 47 |
+
" def override(self, configFile):\n",
|
| 48 |
+
" \"\"\"\n",
|
| 49 |
+
" over ride configuration from file\n",
|
| 50 |
+
"\n",
|
| 51 |
+
" Parameters\n",
|
| 52 |
+
" configFile : override config file path\n",
|
| 53 |
+
" \"\"\"\n",
|
| 54 |
+
" with open(configFile) as fp:\n",
|
| 55 |
+
" for key, value in jprops.iter_properties(fp):\n",
|
| 56 |
+
" self.configs[key] = value\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"\n",
|
| 59 |
+
" def setParam(self, name, value):\n",
|
| 60 |
+
" \"\"\"\n",
|
| 61 |
+
" override individual configuration\n",
|
| 62 |
+
" Parameters\n",
|
| 63 |
+
" name : config param name\n",
|
| 64 |
+
" value : config param value\n",
|
| 65 |
+
" \"\"\"\n",
|
| 66 |
+
" self.configs[name] = value\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"\n",
|
| 69 |
+
" def getStringConfig(self, name):\n",
|
| 70 |
+
" \"\"\"\n",
|
| 71 |
+
" get string param\n",
|
| 72 |
+
" Parameters\n",
|
| 73 |
+
" name : config param name\n",
|
| 74 |
+
" \"\"\"\n",
|
| 75 |
+
" if self.isNone(name):\n",
|
| 76 |
+
" val = (None, False)\n",
|
| 77 |
+
" elif self.isDefault(name):\n",
|
| 78 |
+
" val = (self.handleDefault(name), True)\n",
|
| 79 |
+
" else:\n",
|
| 80 |
+
" val = (self.configs[name], False)\n",
|
| 81 |
+
" if self.verbose:\n",
|
| 82 |
+
" print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
|
| 83 |
+
" return val\n",
|
| 84 |
+
"\n",
|
| 85 |
+
"\n",
|
| 86 |
+
" def getIntConfig(self, name):\n",
|
| 87 |
+
" \"\"\"\n",
|
| 88 |
+
" get int param\n",
|
| 89 |
+
" Parameters\n",
|
| 90 |
+
" name : config param name\n",
|
| 91 |
+
" \"\"\"\n",
|
| 92 |
+
" #print \"%s %s\" %(name,self.configs[name])\n",
|
| 93 |
+
" if self.isNone(name):\n",
|
| 94 |
+
" val = (None, False)\n",
|
| 95 |
+
" elif self.isDefault(name):\n",
|
| 96 |
+
" val = (self.handleDefault(name), True)\n",
|
| 97 |
+
" else:\n",
|
| 98 |
+
" val = (int(self.configs[name]), False)\n",
|
| 99 |
+
" if self.verbose:\n",
|
| 100 |
+
" print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
|
| 101 |
+
" return val\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"\n",
|
| 104 |
+
" def getFloatConfig(self, name):\n",
|
| 105 |
+
" \"\"\"\n",
|
| 106 |
+
" get float param\n",
|
| 107 |
+
" Parameters\n",
|
| 108 |
+
" name : config param name\n",
|
| 109 |
+
" \"\"\"\n",
|
| 110 |
+
" #print \"%s %s\" %(name,self.configs[name])\n",
|
| 111 |
+
" if self.isNone(name):\n",
|
| 112 |
+
" val = (None, False)\n",
|
| 113 |
+
" elif self.isDefault(name):\n",
|
| 114 |
+
" val = (self.handleDefault(name), True)\n",
|
| 115 |
+
" else:\n",
|
| 116 |
+
" val = (float(self.configs[name]), False)\n",
|
| 117 |
+
" if self.verbose:\n",
|
| 118 |
+
" print( \"{} {} {:06.3f}\".format(name, self.configs[name], val[0]))\n",
|
| 119 |
+
" return val\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"\n",
|
| 122 |
+
" def getBooleanConfig(self, name):\n",
|
| 123 |
+
" \"\"\"\n",
|
| 124 |
+
" #get boolean param\n",
|
| 125 |
+
" Parameters\n",
|
| 126 |
+
" name : config param name\n",
|
| 127 |
+
" \"\"\"\n",
|
| 128 |
+
" if self.isNone(name):\n",
|
| 129 |
+
" val = (None, False)\n",
|
| 130 |
+
" elif self.isDefault(name):\n",
|
| 131 |
+
" val = (self.handleDefault(name), True)\n",
|
| 132 |
+
" else:\n",
|
| 133 |
+
" bVal = self.configs[name].lower() == \"true\"\n",
|
| 134 |
+
" val = (bVal, False)\n",
|
| 135 |
+
" if self.verbose:\n",
|
| 136 |
+
" print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
|
| 137 |
+
" return val\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"\n",
|
| 140 |
+
" def getIntListConfig(self, name, delim=\",\"):\n",
|
| 141 |
+
" \"\"\"\n",
|
| 142 |
+
" get int list param\n",
|
| 143 |
+
" Parameters\n",
|
| 144 |
+
" name : config param name\n",
|
| 145 |
+
" delim : delemeter\n",
|
| 146 |
+
" \"\"\"\n",
|
| 147 |
+
" if self.isNone(name):\n",
|
| 148 |
+
" val = (None, False)\n",
|
| 149 |
+
" elif self.isDefault(name):\n",
|
| 150 |
+
" val = (self.handleDefault(name), True)\n",
|
| 151 |
+
" else:\n",
|
| 152 |
+
" delSepStr = self.getStringConfig(name)\n",
|
| 153 |
+
"\n",
|
| 154 |
+
" #specified as range\n",
|
| 155 |
+
" intList = strListOrRangeToIntArray(delSepStr[0])\n",
|
| 156 |
+
" val =(intList, delSepStr[1])\n",
|
| 157 |
+
" return val\n",
|
| 158 |
+
"\n",
|
| 159 |
+
" def getFloatListConfig(self, name, delim=\",\"):\n",
|
| 160 |
+
" \"\"\"\n",
|
| 161 |
+
" get float list param\n",
|
| 162 |
+
" Parameters\n",
|
| 163 |
+
" name : config param name\n",
|
| 164 |
+
" delim : delemeter\n",
|
| 165 |
+
" \"\"\"\n",
|
| 166 |
+
" delSepStr = self.getStringConfig(name)\n",
|
| 167 |
+
" if self.isNone(name):\n",
|
| 168 |
+
" val = (None, False)\n",
|
| 169 |
+
" elif self.isDefault(name):\n",
|
| 170 |
+
" val = (self.handleDefault(name), True)\n",
|
| 171 |
+
" else:\n",
|
| 172 |
+
" flList = strToFloatArray(delSepStr[0], delim)\n",
|
| 173 |
+
" val =(flList, delSepStr[1])\n",
|
| 174 |
+
" return val\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"\n",
|
| 177 |
+
" def getStringListConfig(self, name, delim=\",\"):\n",
|
| 178 |
+
" \"\"\"\n",
|
| 179 |
+
" get string list param\n",
|
| 180 |
+
" Parameters\n",
|
| 181 |
+
" name : config param name\n",
|
| 182 |
+
" delim : delemeter\n",
|
| 183 |
+
" \"\"\"\n",
|
| 184 |
+
" delSepStr = self.getStringConfig(name)\n",
|
| 185 |
+
" if self.isNone(name):\n",
|
| 186 |
+
" val = (None, False)\n",
|
| 187 |
+
" elif self.isDefault(name):\n",
|
| 188 |
+
" val = (self.handleDefault(name), True)\n",
|
| 189 |
+
" else:\n",
|
| 190 |
+
" strList = delSepStr[0].split(delim)\n",
|
| 191 |
+
" val = (strList, delSepStr[1])\n",
|
| 192 |
+
" return val\n",
|
| 193 |
+
"\n",
|
| 194 |
+
" def handleDefault(self, name):\n",
|
| 195 |
+
" \"\"\"\n",
|
| 196 |
+
" handles default\n",
|
| 197 |
+
" Parameters\n",
|
| 198 |
+
" name : config param name\n",
|
| 199 |
+
" \"\"\"\n",
|
| 200 |
+
" dVal = self.defValues[name]\n",
|
| 201 |
+
" if (dVal[1] is None):\n",
|
| 202 |
+
" val = dVal[0]\n",
|
| 203 |
+
" else:\n",
|
| 204 |
+
" raise ValueError(dVal[1])\n",
|
| 205 |
+
" return val\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" def isNone(self, name):\n",
|
| 209 |
+
" \"\"\"\n",
|
| 210 |
+
" true is value is None\t\n",
|
| 211 |
+
" Parameters\n",
|
| 212 |
+
" name : config param name\n",
|
| 213 |
+
" \"\"\"\n",
|
| 214 |
+
" return self.configs[name].lower() == \"none\"\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"\n",
|
| 217 |
+
" def isDefault(self, name):\n",
|
| 218 |
+
" \"\"\"\n",
|
| 219 |
+
" true if the value is default\t\n",
|
| 220 |
+
" Parameters\n",
|
| 221 |
+
" name : config param name\n",
|
| 222 |
+
" \"\"\"\n",
|
| 223 |
+
" de = self.configs[name] == \"_\"\n",
|
| 224 |
+
" #print de\n",
|
| 225 |
+
" return de\n",
|
| 226 |
+
"\n",
|
| 227 |
+
"\n",
|
| 228 |
+
" def eitherOrStringConfig(self, firstName, secondName):\n",
|
| 229 |
+
" \"\"\"\n",
|
| 230 |
+
" returns one of two string parameters\t\n",
|
| 231 |
+
" Parameters\n",
|
| 232 |
+
" firstName : first parameter name\n",
|
| 233 |
+
" secondName : second parameter name\t\n",
|
| 234 |
+
" \"\"\"\n",
|
| 235 |
+
" if not self.isNone(firstName):\n",
|
| 236 |
+
" first = self.getStringConfig(firstName)[0]\n",
|
| 237 |
+
" second = None\n",
|
| 238 |
+
" if not self.isNone(secondName):\n",
|
| 239 |
+
" raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
|
| 240 |
+
" else:\n",
|
| 241 |
+
" if not self.isNone(secondName):\n",
|
| 242 |
+
" second = self.getStringConfig(secondtName)[0]\n",
|
| 243 |
+
" first = None\n",
|
| 244 |
+
" else:\n",
|
| 245 |
+
" raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
|
| 246 |
+
" return (first, second)\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"\n",
|
| 249 |
+
" def eitherOrIntConfig(self, firstName, secondName):\n",
|
| 250 |
+
" \"\"\"\n",
|
| 251 |
+
" returns one of two int parameters\t\n",
|
| 252 |
+
" Parameters\n",
|
| 253 |
+
" firstName : first parameter name\n",
|
| 254 |
+
" secondName : second parameter name\t\n",
|
| 255 |
+
" \"\"\"\n",
|
| 256 |
+
" if not self.isNone(firstName):\n",
|
| 257 |
+
" first = self.getIntConfig(firstName)[0]\n",
|
| 258 |
+
" second = None\n",
|
| 259 |
+
" if not self.isNone(secondName):\n",
|
| 260 |
+
" raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
|
| 261 |
+
" else:\n",
|
| 262 |
+
" if not self.isNone(secondName):\n",
|
| 263 |
+
" second = self.getIntConfig(secondsName)[0]\n",
|
| 264 |
+
" first = None\n",
|
| 265 |
+
" else:\n",
|
| 266 |
+
" raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
|
| 267 |
+
" return (first, second)\n",
|
| 268 |
+
"\n",
|
| 269 |
+
"\n",
|
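A brief sketch of how this Configuration class is typically driven; the property file name and parameter names below are placeholders:

    # app.properties might contain:
    #   train.data.file=data.csv
    #   train.num.iter=_
    defValues = {
        "train.data.file" : (None, "missing training data file"),
        "train.num.iter" : (100, None)
    }
    config = Configuration("app.properties", defValues)
    dataFile = config.getStringConfig("train.data.file")[0]
    numIter = config.getIntConfig("train.num.iter")[0]   # "_" falls back to the default 100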
| 270 |
+
"class CatLabelGenerator:\n",
|
| 271 |
+
" \"\"\"\n",
|
| 272 |
+
" label generator for categorical variables\n",
|
| 273 |
+
" \"\"\"\n",
|
| 274 |
+
" def __init__(self, catValues, delim):\n",
|
| 275 |
+
" \"\"\"\n",
|
| 276 |
+
" initilizers\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" Parameters\n",
|
| 279 |
+
" catValues : dictionary of categorical values\n",
|
| 280 |
+
" delim : delemeter\n",
|
| 281 |
+
" \"\"\"\n",
|
| 282 |
+
" self.encoders = {}\n",
|
| 283 |
+
" self.catValues = catValues\n",
|
| 284 |
+
" self.delim = delim\n",
|
| 285 |
+
" for k in self.catValues.keys():\t\n",
|
| 286 |
+
" le = preprocessing.LabelEncoder()\t\n",
|
| 287 |
+
" le.fit(self.catValues[k])\n",
|
| 288 |
+
" self.encoders[k] = le\n",
|
| 289 |
+
"\n",
|
| 290 |
+
" def processRow(self, row):\t\n",
|
| 291 |
+
" \"\"\"\n",
|
| 292 |
+
" encode row categorical values\n",
|
| 293 |
+
"\n",
|
| 294 |
+
" Parameters:\n",
|
| 295 |
+
" row : data row\n",
|
| 296 |
+
" \"\"\"\n",
|
| 297 |
+
" #print row\n",
|
| 298 |
+
" rowArr = row.split(self.delim)\n",
|
| 299 |
+
" for i in range(len(rowArr)):\n",
|
| 300 |
+
" if (i in self.catValues):\n",
|
| 301 |
+
" curVal = rowArr[i]\n",
|
| 302 |
+
" assert curVal in self.catValues[i], \"categorival value invalid\"\n",
|
| 303 |
+
" encVal = self.encoders[i].transform([curVal])\n",
|
| 304 |
+
" rowArr[i] = str(encVal[0])\n",
|
| 305 |
+
" return self.delim.join(rowArr)\t\t\n",
|
| 306 |
+
"\n",
|
| 307 |
+
" def getOrigLabels(self, indx):\n",
|
| 308 |
+
" \"\"\"\n",
|
| 309 |
+
" get original labels\n",
|
| 310 |
+
"\n",
|
| 311 |
+
" Parameters:\n",
|
| 312 |
+
" indx : column index\n",
|
| 313 |
+
" \"\"\"\n",
|
| 314 |
+
" return self.encoders[indx].classes_\t\n",
|
| 315 |
+
"\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"class SupvLearningDataGenerator:\n",
|
| 318 |
+
" \"\"\"\n",
|
| 319 |
+
" data generator for supervised learning\n",
|
| 320 |
+
" \"\"\"\n",
|
| 321 |
+
" def __init__(self, configFile):\n",
|
| 322 |
+
" \"\"\"\n",
|
| 323 |
+
" initilizers\n",
|
| 324 |
+
"\n",
|
| 325 |
+
" Parameters\n",
|
| 326 |
+
" configFile : config file path\n",
|
| 327 |
+
" \"\"\"\n",
|
| 328 |
+
" defValues = dict()\n",
|
| 329 |
+
" defValues[\"common.num.samp\"] = (100, None)\n",
|
| 330 |
+
" defValues[\"common.num.feat\"] = (5, None)\n",
|
| 331 |
+
" defValues[\"common.feat.trans\"] = (None, None)\n",
|
| 332 |
+
" defValues[\"common.feat.types\"] = (None, \"missing feature types\")\n",
|
| 333 |
+
" defValues[\"common.cat.feat.distr\"] = (None, None)\n",
|
| 334 |
+
" defValues[\"common.output.precision\"] = (3, None)\n",
|
| 335 |
+
" defValues[\"common.error\"] = (0.01, None)\n",
|
| 336 |
+
" defValues[\"class.gen.technique\"] = (\"blob\", None)\n",
|
| 337 |
+
" defValues[\"class.num.feat.informative\"] = (2, None)\n",
|
| 338 |
+
" defValues[\"class.num.feat.redundant\"] = (2, None)\n",
|
| 339 |
+
" defValues[\"class.num.feat.repeated\"] = (0, None)\n",
|
| 340 |
+
" defValues[\"class.num.feat.cat\"] = (0, None)\n",
|
| 341 |
+
" defValues[\"class.num.class\"] = (2, None)\n",
|
| 342 |
+
"\n",
|
| 343 |
+
" self.config = Configuration(configFile, defValues)\n",
|
| 344 |
+
"\n",
|
| 345 |
+
" def genClassifierData(self):\n",
|
| 346 |
+
" \"\"\"\n",
|
| 347 |
+
" generates classifier data\n",
|
| 348 |
+
" \"\"\"\n",
|
| 349 |
+
" nsamp = self.config.getIntConfig(\"common.num.samp\")[0]\n",
|
| 350 |
+
" nfeat = self.config.getIntConfig(\"common.num.feat\")[0]\n",
|
| 351 |
+
" nclass = self.config.getIntConfig(\"class.num.class\")[0]\n",
|
| 352 |
+
" #transform with shift and scale\n",
|
| 353 |
+
" ftrans = self.config.getFloatListConfig(\"common.feat.trans\")[0]\n",
|
| 354 |
+
" feTrans = dict()\n",
|
| 355 |
+
" for i in range(0, len(ftrans), 2):\n",
|
| 356 |
+
" tr = (ftrans[i], ftrans[i+1])\n",
|
| 357 |
+
" indx = int(i/2)\n",
|
| 358 |
+
" feTrans[indx] = tr\n",
|
| 359 |
+
"\n",
|
| 360 |
+
" ftypes = self.config.getStringListConfig(\"common.feat.types\")[0]\n",
|
| 361 |
+
"\n",
|
| 362 |
+
" # categorical feature distribution\n",
|
| 363 |
+
" feCatDist = dict()\n",
|
| 364 |
+
" fcatdl = self.config.getStringListConfig(\"common.cat.feat.distr\")[0]\n",
|
| 365 |
+
" for fcatds in fcatdl:\n",
|
| 366 |
+
" fcatd = fcatds.split(\":\")\n",
|
| 367 |
+
" feInd = int(fcatd[0])\n",
|
| 368 |
+
" clVal = int(fcatd[1])\n",
|
| 369 |
+
" key = (feInd, clVal)\t\t#feature index and class value\n",
|
| 370 |
+
" dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))\n",
|
| 371 |
+
" feCatDist[key] = CategoricalRejectSampler(*dist)\n",
|
| 372 |
+
"\n",
|
| 373 |
+
" #shift and scale\n",
|
| 374 |
+
" genTechnique = self.config.getStringConfig(\"class.gen.technique\")[0]\n",
|
| 375 |
+
" error = self.config.getFloatConfig(\"common.error\")[0]\n",
|
| 376 |
+
" if genTechnique == \"blob\":\n",
|
| 377 |
+
" features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)\n",
|
| 378 |
+
" for i in range(nsamp):\t\t\t#shift and scale\n",
|
| 379 |
+
" for j in range(nfeat):\n",
|
| 380 |
+
" tr = feTrans[j]\n",
|
| 381 |
+
" features[i,j] = (features[i,j] + tr[0]) * tr[1]\n",
|
| 382 |
+
" claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))\n",
|
| 383 |
+
" elif genTechnique == \"classify\":\n",
|
| 384 |
+
" nfeatInfo = self.config.getIntConfig(\"class.num.feat.informative\")[0]\n",
|
| 385 |
+
" nfeatRed = self.config.getIntConfig(\"class.num.feat.redundant\")[0]\n",
|
| 386 |
+
" nfeatRep = self.config.getIntConfig(\"class.num.feat.repeated\")[0]\n",
|
| 387 |
+
" shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))\n",
|
| 388 |
+
" scales = list(map(lambda i : feTrans[i][1], range(nfeat)))\n",
|
| 389 |
+
" features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, \n",
|
| 390 |
+
" n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)\n",
|
| 391 |
+
" else:\n",
|
| 392 |
+
" raise \"invalid genaration technique\"\n",
|
| 393 |
+
"\n",
|
| 394 |
+
" # add categorical features and format\n",
|
| 395 |
+
" nCatFeat = self.config.getIntConfig(\"class.num.feat.cat\")[0]\n",
|
| 396 |
+
" prec = self.config.getIntConfig(\"common.output.precision\")[0]\n",
|
| 397 |
+
" for f , c in zip(features, claz):\n",
|
| 398 |
+
" nfs = list(map(lambda i : self.numFeToStr(i, f[i], c, ftypes[i], prec), range(nfeat)))\n",
|
| 399 |
+
" if nCatFeat > 0:\n",
|
| 400 |
+
" cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))\n",
|
| 401 |
+
" rec = \",\".join(nfs) + \",\" + \",\".join(cfs) + \",\" + str(c)\n",
|
| 402 |
+
" else:\n",
|
| 403 |
+
" rec = \",\".join(nfs) + \",\" + str(c)\n",
|
| 404 |
+
" yield rec\n",
|
| 405 |
+
"\n",
|
| 406 |
+
" def numFeToStr(self, fv, ft, prec):\n",
|
| 407 |
+
" \"\"\"\n",
|
| 408 |
+
" nummeric feature value to string\n",
|
| 409 |
+
"\n",
|
| 410 |
+
" Parameters\n",
|
| 411 |
+
" fv : field value\n",
|
| 412 |
+
" ft : field data type\n",
|
| 413 |
+
" prec : precision\n",
|
| 414 |
+
" \"\"\"\n",
|
| 415 |
+
" if ft == \"float\":\n",
|
| 416 |
+
" s = formatFloat(prec, fv)\n",
|
| 417 |
+
" elif ft ==\"int\":\n",
|
| 418 |
+
" s = str(int(fv))\n",
|
| 419 |
+
" else:\t\t\n",
|
| 420 |
+
" raise \"invalid type expecting float or int\"\n",
|
| 421 |
+
" return s\n",
|
| 422 |
+
"\n",
|
| 423 |
+
" def catFe(self, i, cv, ft, feCatDist):\n",
|
| 424 |
+
" \"\"\"\n",
|
| 425 |
+
" generate categorical feature\n",
|
| 426 |
+
"\n",
|
| 427 |
+
" Parameters\n",
|
| 428 |
+
" i : col index\n",
|
| 429 |
+
" cv : class value\n",
|
| 430 |
+
" ft : field data type\n",
|
| 431 |
+
" feCatDist : cat value distribution\n",
|
| 432 |
+
" \"\"\"\n",
|
| 433 |
+
" if ft == \"cat\":\n",
|
| 434 |
+
" key = (i, cv)\n",
|
| 435 |
+
" s = feCatDist[key].sample()\n",
|
| 436 |
+
" else:\t\t\n",
|
| 437 |
+
" raise \"invalid type expecting categorical\"\n",
|
| 438 |
+
" return s\n",
|
| 439 |
+
"\n",
|
| 440 |
+
"\n",
|
| 441 |
+
"\n",
|
| 442 |
+
"def loadDataFile(file, delim, cols, colIndices):\n",
|
| 443 |
+
" \"\"\"\n",
|
| 444 |
+
" loads delim separated file and extracts columns\n",
|
| 445 |
+
" Parameters\n",
|
| 446 |
+
" file : file path\n",
|
| 447 |
+
" delim : delemeter\n",
|
| 448 |
+
" cols : columns to use from file\n",
|
| 449 |
+
" colIndices ; columns to extract\n",
|
| 450 |
+
" \"\"\"\n",
|
| 451 |
+
" data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
|
| 452 |
+
" extrData = data[:,colIndices]\n",
|
| 453 |
+
" return (data, extrData)\n",
|
| 454 |
+
"\n",
|
| 455 |
+
"def loadFeatDataFile(file, delim, cols):\n",
|
| 456 |
+
" \"\"\"\n",
|
| 457 |
+
" loads delim separated file and extracts columns\n",
|
| 458 |
+
"\n",
|
| 459 |
+
" Parameters\n",
|
| 460 |
+
" file : file path\n",
|
| 461 |
+
" delim : delemeter\n",
|
| 462 |
+
" cols : columns to use from file\n",
|
| 463 |
+
" \"\"\"\n",
|
| 464 |
+
" data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
|
| 465 |
+
" return data\n",
|
| 466 |
+
"\n",
|
| 467 |
+
"def extrColumns(arr, columns):\n",
|
| 468 |
+
" \"\"\"\n",
|
| 469 |
+
" extracts columns\n",
|
| 470 |
+
"\n",
|
| 471 |
+
" Parameters\n",
|
| 472 |
+
" arr : 2D array\n",
|
| 473 |
+
" columns : columns\n",
|
| 474 |
+
" \"\"\"\n",
|
| 475 |
+
" return arr[:, columns]\n",
|
| 476 |
+
"\n",
|
| 477 |
+
"def subSample(featData, clsData, subSampleRate, withReplacement):\n",
|
| 478 |
+
" \"\"\"\n",
|
| 479 |
+
" subsample feature and class label data\t\n",
|
| 480 |
+
" Parameters\n",
|
| 481 |
+
" featData : 2D array of feature data\n",
|
| 482 |
+
" clsData : arrray of class labels\n",
|
| 483 |
+
" subSampleRate : fraction to be sampled\n",
|
| 484 |
+
" withReplacement : true if sampling with replacement\n",
|
| 485 |
+
" \"\"\"\n",
|
| 486 |
+
" sampSize = int(featData.shape[0] * subSampleRate)\n",
|
| 487 |
+
" sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement)\n",
|
| 488 |
+
" sampFeat = featData[sampledIndx]\n",
|
| 489 |
+
" sampCls = clsData[sampledIndx]\n",
|
| 490 |
+
" return(sampFeat, sampCls)\n",
|
| 491 |
+
"\n",
|
| 492 |
+
"def euclideanDistance(x,y):\n",
|
| 493 |
+
" \"\"\"\n",
|
| 494 |
+
" euclidean distance\n",
|
| 495 |
+
" Parameters\n",
|
| 496 |
+
" x : first vector\n",
|
| 497 |
+
" y : second fvector\n",
|
| 498 |
+
" \"\"\"\n",
|
| 499 |
+
" return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"def squareRooted(x):\n",
|
| 502 |
+
" \"\"\"\n",
|
| 503 |
+
" square root of sum square\n",
|
| 504 |
+
" Parameters\n",
|
| 505 |
+
" x : data vector\n",
|
| 506 |
+
" \"\"\"\n",
|
| 507 |
+
" return round(sqrt(sum([a*a for a in x])),3)\n",
|
| 508 |
+
"\n",
|
| 509 |
+
"def cosineSimilarity(x,y):\n",
|
| 510 |
+
" \"\"\"\n",
|
| 511 |
+
" cosine similarity\n",
|
| 512 |
+
"\n",
|
| 513 |
+
" Parameters\n",
|
| 514 |
+
" x : first vector\n",
|
| 515 |
+
" y : second fvector\n",
|
| 516 |
+
" \"\"\"\n",
|
| 517 |
+
" numerator = sum(a*b for a,b in zip(x,y))\n",
|
| 518 |
+
" denominator = squareRooted(x) * squareRooted(y)\n",
|
| 519 |
+
" return round(numerator / float(denominator), 3)\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"def cosineDistance(x,y):\n",
|
| 522 |
+
" \"\"\"\n",
|
| 523 |
+
" cosine distance\n",
|
| 524 |
+
" Parameters\n",
|
| 525 |
+
" x : first vector\n",
|
| 526 |
+
" y : second fvector\n",
|
| 527 |
+
" \"\"\"\n",
|
| 528 |
+
" return 1.0 - cosineSimilarity(x,y)\n",
|
| 529 |
+
"\n",
|
| 530 |
+
"def manhattanDistance(x,y):\n",
|
| 531 |
+
" \"\"\"\n",
|
| 532 |
+
" manhattan distance\n",
|
| 533 |
+
" Parameters\n",
|
| 534 |
+
" x : first vector\n",
|
| 535 |
+
" y : second fvector\n",
|
| 536 |
+
" \"\"\"\n",
|
| 537 |
+
" return sum(abs(a-b) for a,b in zip(x,y))\n",
|
| 538 |
+
"\n",
|
| 539 |
+
"def nthRoot(value, nRoot):\n",
|
| 540 |
+
" \"\"\"\n",
|
| 541 |
+
" nth root\n",
|
| 542 |
+
" Parameters\n",
|
| 543 |
+
" value : data value\n",
|
| 544 |
+
" nRoot : root\n",
|
| 545 |
+
" \"\"\"\n",
|
| 546 |
+
" rootValue = 1/float(nRoot)\n",
|
| 547 |
+
" return round (Decimal(value) ** Decimal(rootValue),3)\n",
|
| 548 |
+
"\n",
|
| 549 |
+
"def minkowskiDistance(x,y,pValue):\n",
|
| 550 |
+
" \"\"\"\n",
|
| 551 |
+
" minkowski distance\n",
|
| 552 |
+
" Parameters\n",
|
| 553 |
+
" x : first vector\n",
|
| 554 |
+
" y : second fvector\n",
|
| 555 |
+
" pValue : power factor\n",
|
| 556 |
+
" \"\"\"\n",
|
| 557 |
+
" return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)\n",
|
| 558 |
+
"\n",
|
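"#note (illustrative): minkowskiDistance generalizes the distances above, reducing to\n",
"#manhattanDistance for pValue=1 and to euclideanDistance for pValue=2,\n",
"#e.g. minkowskiDistance([0, 0], [3, 4], 2) gives 5.000\n",
"\n",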
| 559 |
+
"def jaccardSimilarityX(x,y):\n",
|
| 560 |
+
" \"\"\"\n",
|
| 561 |
+
" jaccard similarity\n",
|
| 562 |
+
" Parameters\n",
|
| 563 |
+
" x : first vector\n",
|
| 564 |
+
" y : second fvector\n",
|
| 565 |
+
" \"\"\"\n",
|
| 566 |
+
" intersectionCardinality = len(set.intersection(*[set(x), set(y)]))\n",
|
| 567 |
+
" unionCardinality = len(set.union(*[set(x), set(y)]))\n",
|
| 568 |
+
" return intersectionCardinality/float(unionCardinality)\n",
|
| 569 |
+
"\n",
|
| 570 |
+
"def jaccardSimilarity(x,y,wx=1.0,wy=1.0):\n",
|
| 571 |
+
" \"\"\"\n",
|
| 572 |
+
" jaccard similarity\n",
|
| 573 |
+
"\n",
|
| 574 |
+
" Parameters\n",
|
| 575 |
+
" x : first vector\n",
|
| 576 |
+
" y : second fvector\n",
|
| 577 |
+
" wx : weight for x\n",
|
| 578 |
+
" wy : weight for y\n",
|
| 579 |
+
" \"\"\"\n",
|
| 580 |
+
" sx = set(x)\n",
|
| 581 |
+
" sy = set(y)\n",
|
| 582 |
+
" sxyInt = sx.intersection(sy)\n",
|
| 583 |
+
" intCardinality = len(sxyInt)\n",
|
| 584 |
+
" sxIntDiff = sx.difference(sxyInt)\n",
|
| 585 |
+
" syIntDiff = sy.difference(sxyInt)\n",
|
| 586 |
+
" unionCardinality = len(sx.union(sy))\n",
|
| 587 |
+
" return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))\n",
|
| 588 |
+
"\n",
|
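"#example (illustrative): jaccardSimilarity([1, 2, 3], [2, 3, 4]) = 2 / (2 + 1 + 1) = 0.5; the\n",
"#weights wx and wy control how much elements unique to x and to y lower the similarity\n",
"\n",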
| 589 |
+
"def levenshteinSimilarity(s1, s2):\n",
|
| 590 |
+
" \"\"\"\n",
|
| 591 |
+
" Levenshtein similarity for strings\n",
|
| 592 |
+
"\n",
|
| 593 |
+
" Parameters\n",
|
| 594 |
+
" sx : first string\n",
|
| 595 |
+
" sy : second string\n",
|
| 596 |
+
" \"\"\"\n",
|
| 597 |
+
" assert type(s1) == str and type(s2) == str, \"Levenshtein similarity is for string only\"\n",
|
| 598 |
+
" d = ld(s1,s2)\n",
|
| 599 |
+
" #print(d)\n",
|
| 600 |
+
" l = max(len(s1),len(s2))\n",
|
| 601 |
+
" d = 1.0 - min(d/l, 1.0)\n",
|
| 602 |
+
" return d\t\n",
|
| 603 |
+
"\n",
|
| 604 |
+
"def norm(values, po=2):\n",
|
| 605 |
+
" \"\"\"\n",
|
| 606 |
+
" norm\n",
|
| 607 |
+
" Parameters\n",
|
| 608 |
+
" values : list of values\n",
|
| 609 |
+
" po : power\n",
|
| 610 |
+
" \"\"\"\n",
|
| 611 |
+
" no = sum(list(map(lambda v: pow(v,po), values)))\n",
|
| 612 |
+
" no = pow(no,1.0/po)\n",
|
| 613 |
+
" return list(map(lambda v: v/no, values))\n",
|
| 614 |
+
"\n",
|
| 615 |
+
"def createOneHotVec(size, indx = -1):\n",
|
| 616 |
+
" \"\"\"\n",
|
| 617 |
+
" random one hot vector\n",
|
| 618 |
+
"\n",
|
| 619 |
+
" Parameters\n",
|
| 620 |
+
" size : vector size\n",
|
| 621 |
+
" indx : one hot position\n",
|
| 622 |
+
" \"\"\"\n",
|
| 623 |
+
" vec = [0] * size\n",
|
| 624 |
+
" s = random.randint(0, size - 1) if indx < 0 else indx\n",
|
| 625 |
+
" vec[s] = 1\n",
|
| 626 |
+
" return vec\n",
|
| 627 |
+
"\n",
|
| 628 |
+
"def createAllOneHotVec(size):\n",
|
| 629 |
+
" \"\"\"\n",
|
| 630 |
+
" create all one hot vectors\n",
|
| 631 |
+
"\n",
|
| 632 |
+
" Parameters\n",
|
| 633 |
+
" size : vector size and no of vectors\n",
|
| 634 |
+
" \"\"\"\n",
|
| 635 |
+
" vecs = list()\n",
|
| 636 |
+
" for i in range(size):\n",
|
| 637 |
+
" vec = [0] * size\n",
|
| 638 |
+
" vec[i] = 1\n",
|
| 639 |
+
" vecs.append(vec)\n",
|
| 640 |
+
" return vecs\n",
|
| 641 |
+
"\n",
|
| 642 |
+
"def blockShuffle(data, blockSize):\n",
|
| 643 |
+
" \"\"\"\n",
|
| 644 |
+
" block shuffle \t\n",
|
| 645 |
+
"\n",
|
| 646 |
+
" Parameters\n",
|
| 647 |
+
" data : list data\n",
|
| 648 |
+
" blockSize : block size\n",
|
| 649 |
+
" \"\"\"\n",
|
| 650 |
+
" numBlock = int(len(data) / blockSize)\n",
|
| 651 |
+
" remain = len(data) % blockSize\n",
|
| 652 |
+
" numBlock += (1 if remain > 0 else 0)\n",
|
| 653 |
+
" shuffled = list()\n",
|
| 654 |
+
" for i in range(numBlock):\n",
|
| 655 |
+
" b = random.randint(0, numBlock-1)\n",
|
| 656 |
+
" beg = b * blockSize\n",
|
| 657 |
+
" if (b < numBlock-1):\n",
|
| 658 |
+
" end = beg + blockSize\n",
|
| 659 |
+
" shuffled.extend(data[beg:end])\t\t\n",
|
| 660 |
+
" else:\n",
|
| 661 |
+
" shuffled.extend(data[beg:])\n",
|
| 662 |
+
" return shuffled\t\n",
|
| 663 |
+
"\n",
|
| 664 |
+
"def shuffle(data, numShuffle):\n",
|
| 665 |
+
" \"\"\"\n",
|
| 666 |
+
" shuffle data by randonm swapping\n",
|
| 667 |
+
"\n",
|
| 668 |
+
" Parameters\n",
|
| 669 |
+
" data : list data\n",
|
| 670 |
+
" numShuffle : no of pairwise swaps\n",
|
| 671 |
+
" \"\"\"\n",
|
| 672 |
+
" sz = len(data)\n",
|
| 673 |
+
" if numShuffle is None:\n",
|
| 674 |
+
" numShuffle = int(sz / 2)\n",
|
| 675 |
+
" for i in range(numShuffle):\n",
|
| 676 |
+
" fi = random.randint(0, sz -1)\n",
|
| 677 |
+
" se = random.randint(0, sz -1)\n",
|
| 678 |
+
" tmp = data[fi]\n",
|
| 679 |
+
" data[fi] = data[se]\n",
|
| 680 |
+
" data[se] = tmp\t\n",
|
| 681 |
+
"\n",
|
| 682 |
+
"def randomWalk(size, start, lowStep, highStep):\n",
|
| 683 |
+
" \"\"\"\n",
|
| 684 |
+
" random walk\t\n",
|
| 685 |
+
"\n",
|
| 686 |
+
" Parameters\n",
|
| 687 |
+
" size : list data\n",
|
| 688 |
+
" start : initial position\n",
|
| 689 |
+
" lowStep : step min\n",
|
| 690 |
+
" highStep : step max\n",
|
| 691 |
+
" \"\"\"\n",
|
| 692 |
+
" cur = start\n",
|
| 693 |
+
" for i in range(size):\n",
|
| 694 |
+
" yield cur\n",
|
| 695 |
+
" cur += randomFloat(lowStep, highStep)\n",
|
| 696 |
+
"\n",
|
| 697 |
+
"def binaryEcodeCategorical(values, value):\n",
|
| 698 |
+
" \"\"\"\n",
|
| 699 |
+
" one hot binary encoding\t\n",
|
| 700 |
+
"\n",
|
| 701 |
+
" Parameters\n",
|
| 702 |
+
" values : list of values\n",
|
| 703 |
+
" value : value to be replaced with 1\n",
|
| 704 |
+
" \"\"\"\n",
|
| 705 |
+
" size = len(values)\n",
|
| 706 |
+
" vec = [0] * size\n",
|
| 707 |
+
" for i in range(size):\n",
|
| 708 |
+
" if (values[i] == value):\n",
|
| 709 |
+
" vec[i] = 1\n",
|
| 710 |
+
" return vec\t\t\n",
|
| 711 |
+
"\n",
|
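"#example (illustrative): binaryEcodeCategorical([\"a\", \"b\", \"c\"], \"b\") returns [0, 1, 0]\n",
"\n",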
| 712 |
+
"def createLabeledSeq(inputData, tw):\n",
|
| 713 |
+
" \"\"\"\n",
|
| 714 |
+
" Creates feature, label pair from sequence data, where we have tw number of features followed by output\n",
|
| 715 |
+
"\n",
|
| 716 |
+
" Parameters\n",
|
| 717 |
+
" values : list containing feature and label\n",
|
| 718 |
+
" tw : no of features\n",
|
| 719 |
+
" \"\"\"\n",
|
| 720 |
+
" features = list()\n",
|
| 721 |
+
" labels = list()\n",
|
| 722 |
+
" l = len(inputDta)\n",
|
| 723 |
+
" for i in range(l - tw):\n",
|
| 724 |
+
" trainSeq = inputData[i:i+tw]\n",
|
| 725 |
+
" trainLabel = inputData[i+tw]\n",
|
| 726 |
+
" features.append(trainSeq)\n",
|
| 727 |
+
" labels.append(trainLabel)\n",
|
| 728 |
+
" return (features, labels)\n",
|
| 729 |
+
"\n",
|
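"#note (illustrative): with inputData = [1, 2, 3, 4, 5] and tw = 3 this yields features\n",
"#[[1, 2, 3], [2, 3, 4]] and labels [4, 5]; the file based definition of the same name below\n",
"#shadows this one when the cell is run top to bottom\n",
"\n",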
| 730 |
+
"def createLabeledSeq(filePath, delim, index, tw):\n",
|
| 731 |
+
" \"\"\"\n",
|
| 732 |
+
" Creates feature, label pair from 1D sequence data in file\t\n",
|
| 733 |
+
"\n",
|
| 734 |
+
" Parameters\n",
|
| 735 |
+
" filePath : file path\n",
|
| 736 |
+
" delim : delemeter\n",
|
| 737 |
+
" index : column index\n",
|
| 738 |
+
" tw : no of features\n",
|
| 739 |
+
" \"\"\"\n",
|
| 740 |
+
" seqData = getFileColumnAsFloat(filePath, delim, index)\n",
|
| 741 |
+
" return createLabeledSeq(seqData, tw)\n",
|
| 742 |
+
"\n",
|
| 743 |
+
"def fromMultDimSeqToTabular(data, inpSize, seqLen):\n",
|
| 744 |
+
" \"\"\"\n",
|
| 745 |
+
" Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)\n",
|
| 746 |
+
"\n",
|
| 747 |
+
" Parameters\n",
|
| 748 |
+
" data : 2D array\n",
|
| 749 |
+
" inpSize : each input size in sequence\n",
|
| 750 |
+
" seqLen : sequence length\n",
|
| 751 |
+
" \"\"\"\t\n",
|
| 752 |
+
" nrow = data.shape[0]\n",
|
| 753 |
+
" assert data.shape[1] == inpSize * seqLen, \"invalid input size or sequence length\"\n",
|
| 754 |
+
" return data.reshape(nrow * seqLen, inpSize)\n",
|
| 755 |
+
"\n",
|
| 756 |
+
"def fromTabularToMultDimSeq(data, inpSize, seqLen):\n",
|
| 757 |
+
" \"\"\"\n",
|
| 758 |
+
" Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) \n",
|
| 759 |
+
" Parameters\n",
|
| 760 |
+
" data : 2D array\n",
|
| 761 |
+
" inpSize : each input size in sequence\n",
|
| 762 |
+
" seqLen : sequence length\n",
|
| 763 |
+
" \"\"\"\t\n",
|
| 764 |
+
" nrow = int(data.shape[0] / seqLen)\n",
|
| 765 |
+
" assert data.shape[1] == inpSize, \"invalid input size\"\n",
|
| 766 |
+
" return data.reshape(nrow, seqLen * inpSize)\n",
|
| 767 |
+
"\n",
|
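"#example (illustrative): with inpSize=3 and seqLen=4, fromMultDimSeqToTabular reshapes an\n",
"#array of shape (2, 12) to (8, 3), and fromTabularToMultDimSeq maps (8, 3) back to (2, 12)\n",
"\n",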
| 768 |
+
"def difference(data, interval=1):\n",
|
| 769 |
+
" \"\"\"\n",
|
| 770 |
+
" takes difference in time series data\n",
|
| 771 |
+
" Parameters\n",
|
| 772 |
+
" data :list data\n",
|
| 773 |
+
" interval : interval for difference\n",
|
| 774 |
+
" \"\"\"\n",
|
| 775 |
+
" diff = list()\n",
|
| 776 |
+
" for i in range(interval, len(data)):\n",
|
| 777 |
+
" value = data[i] - data[i - interval]\n",
|
| 778 |
+
" diff.append(value)\n",
|
| 779 |
+
" return diff\n",
|
| 780 |
+
"\n",
|
| 781 |
+
"def normalizeMatrix(data, norm, axis=1):\n",
|
| 782 |
+
" \"\"\"\n",
|
| 783 |
+
" normalized each row of the matrix\n",
|
| 784 |
+
"\n",
|
| 785 |
+
" Parameters\n",
|
| 786 |
+
" data : 2D data\n",
|
| 787 |
+
" nporm : normalization method\n",
|
| 788 |
+
" axis : row or column\n",
|
| 789 |
+
" \"\"\"\n",
|
| 790 |
+
" normalized = preprocessing.normalize(data,norm=norm, axis=axis)\n",
|
| 791 |
+
" return normalized\n",
|
| 792 |
+
"\n",
|
| 793 |
+
"def standardizeMatrix(data, axis=0):\n",
|
| 794 |
+
" \"\"\"\n",
|
| 795 |
+
" standardizes each column of the matrix with mean and std deviation\n",
|
| 796 |
+
" Parameters\n",
|
| 797 |
+
" data : 2D data\n",
|
| 798 |
+
" axis : row or column\n",
|
| 799 |
+
" \"\"\"\n",
|
| 800 |
+
" standardized = preprocessing.scale(data, axis=axis)\n",
|
| 801 |
+
" return standardized\n",
|
| 802 |
+
"\n",
|
| 803 |
+
"def asNumpyArray(data):\n",
|
| 804 |
+
" \"\"\"\n",
|
| 805 |
+
" converts to numpy array\n",
|
| 806 |
+
" Parameters\n",
|
| 807 |
+
" data : array\n",
|
| 808 |
+
" \"\"\"\n",
|
| 809 |
+
" return np.array(data)\n",
|
| 810 |
+
"\n",
|
| 811 |
+
"def perfMetric(metric, yActual, yPred, clabels=None):\n",
|
| 812 |
+
" \"\"\"\n",
|
| 813 |
+
" predictive model accuracy metric\n",
|
| 814 |
+
" Parameters\n",
|
| 815 |
+
" metric : accuracy metric\n",
|
| 816 |
+
" yActual : actual values array\n",
|
| 817 |
+
" yPred : predicted values array\n",
|
| 818 |
+
" clabels : class labels\n",
|
| 819 |
+
" \"\"\"\n",
|
| 820 |
+
" if metric == \"rsquare\":\n",
|
| 821 |
+
" score = metrics.r2_score(yActual, yPred)\n",
|
| 822 |
+
" elif metric == \"mae\":\n",
|
| 823 |
+
" score = metrics.mean_absolute_error(yActual, yPred)\n",
|
| 824 |
+
" elif metric == \"mse\":\n",
|
| 825 |
+
" score = metrics.mean_squared_error(yActual, yPred)\n",
|
| 826 |
+
" elif metric == \"acc\":\n",
|
| 827 |
+
" yPred = np.rint(yPred)\n",
|
| 828 |
+
" score = metrics.accuracy_score(yActual, yPred)\n",
|
| 829 |
+
" elif metric == \"mlAcc\":\n",
|
| 830 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 831 |
+
" score = metrics.accuracy_score(yActual, yPred)\n",
|
| 832 |
+
" elif metric == \"prec\":\n",
|
| 833 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 834 |
+
" score = metrics.precision_score(yActual, yPred)\n",
|
| 835 |
+
" elif metric == \"rec\":\n",
|
| 836 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 837 |
+
" score = metrics.recall_score(yActual, yPred)\n",
|
| 838 |
+
" elif metric == \"fone\":\n",
|
| 839 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 840 |
+
" score = metrics.f1_score(yActual, yPred)\n",
|
| 841 |
+
" elif metric == \"confm\":\n",
|
| 842 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 843 |
+
" score = metrics.confusion_matrix(yActual, yPred)\n",
|
| 844 |
+
" elif metric == \"clarep\":\n",
|
| 845 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 846 |
+
" score = metrics.classification_report(yActual, yPred)\n",
|
| 847 |
+
" elif metric == \"bce\":\n",
|
| 848 |
+
" if clabels is None:\n",
|
| 849 |
+
" clabels = [0, 1]\n",
|
| 850 |
+
" score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
|
| 851 |
+
" elif metric == \"ce\":\n",
|
| 852 |
+
" assert clabels is not None, \"labels must be provided\"\n",
|
| 853 |
+
" score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
|
| 854 |
+
" else:\n",
|
| 855 |
+
" exitWithMsg(\"invalid prediction performance metric \" + metric)\n",
|
| 856 |
+
" return score\n",
|
| 857 |
+
"\n",
|
| 858 |
+
"def scaleData(data, method):\n",
|
| 859 |
+
" \"\"\"\n",
|
| 860 |
+
" scales feature data column wise\n",
|
| 861 |
+
" Parameters\n",
|
| 862 |
+
" data : 2D array\n",
|
| 863 |
+
" method : scaling method\n",
|
| 864 |
+
" \"\"\"\n",
|
| 865 |
+
" if method == \"minmax\":\n",
|
| 866 |
+
" scaler = preprocessing.MinMaxScaler()\n",
|
| 867 |
+
" data = scaler.fit_transform(data)\n",
|
| 868 |
+
" elif method == \"zscale\":\n",
|
| 869 |
+
" data = preprocessing.scale(data)\t\n",
|
| 870 |
+
" else:\n",
|
| 871 |
+
" raise ValueError(\"invalid scaling method\")\t\n",
|
| 872 |
+
" return data\n",
|
| 873 |
+
"\n",
|
| 874 |
+
"def scaleDataWithParams(data, method, scParams):\n",
|
| 875 |
+
" \"\"\"\n",
|
| 876 |
+
" scales feature data column wise\n",
|
| 877 |
+
" Parameters\n",
|
| 878 |
+
" data : 2D array\n",
|
| 879 |
+
" method : scaling method\n",
|
| 880 |
+
" scParams : scaling parameters\n",
|
| 881 |
+
" \"\"\"\n",
|
| 882 |
+
" if method == \"minmax\":\n",
|
| 883 |
+
" data = scaleMinMaxTabData(data, scParams)\n",
|
| 884 |
+
" elif method == \"zscale\":\n",
|
| 885 |
+
" raise ValueError(\"invalid scaling method\")\t\n",
|
| 886 |
+
" else:\n",
|
| 887 |
+
" raise ValueError(\"invalid scaling method\")\t\n",
|
| 888 |
+
" return data\n",
|
| 889 |
+
"\n",
|
| 890 |
+
"\n",
|
| 891 |
+
"def scaleMinMaxTabData(tdata, minMax):\n",
|
| 892 |
+
" \"\"\"\n",
|
| 893 |
+
" for tabular scales feature data column wise using min max values for each field\n",
|
| 894 |
+
" Parameters\n",
|
| 895 |
+
" tdata : 2D array\n",
|
| 896 |
+
" minMax : ni, max and range for each column\n",
|
| 897 |
+
" \"\"\"\n",
|
| 898 |
+
" stdata = list()\n",
|
| 899 |
+
" for r in tdata:\n",
|
| 900 |
+
" srdata = list()\n",
|
| 901 |
+
" for i, c in enumerate(r):\n",
|
| 902 |
+
" sd = (c - minMax[i][0]) / minMax[i][2]\n",
|
| 903 |
+
" srdata.append(sd)\n",
|
| 904 |
+
" stdata.append(srdata)\n",
|
| 905 |
+
" return stdata\n",
|
| 906 |
+
"\n",
|
| 907 |
+
"def scaleMinMax(rdata, minMax):\n",
|
| 908 |
+
" \"\"\"\n",
|
| 909 |
+
" scales feature data column wise using min max values for each field\n",
|
| 910 |
+
" Parameters\n",
|
| 911 |
+
" rdata : data array\n",
|
| 912 |
+
" minMax : ni, max and range for each column\n",
|
| 913 |
+
" \"\"\"\n",
|
| 914 |
+
" srdata = list()\n",
|
| 915 |
+
" for i in range(len(rdata)):\n",
|
| 916 |
+
" d = rdata[i]\n",
|
| 917 |
+
" sd = (d - minMax[i][0]) / minMax[i][2]\n",
|
| 918 |
+
" srdata.append(sd)\n",
|
| 919 |
+
" return srdata\n",
|
| 920 |
+
"\n",
|
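"#example (illustrative): each minMax entry is (min, max, range), e.g. with\n",
"#minMax = [(0, 10, 10), (5, 15, 10)] the row [5, 10] is scaled to [0.5, 0.5]\n",
"\n",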
| 921 |
+
"def harmonicNum(n):\n",
|
| 922 |
+
" \"\"\"\n",
|
| 923 |
+
" harmonic number\n",
|
| 924 |
+
" Parameters\n",
|
| 925 |
+
" n : number\n",
|
| 926 |
+
" \"\"\"\n",
|
| 927 |
+
" h = 0\n",
|
| 928 |
+
" for i in range(1, n+1, 1):\n",
|
| 929 |
+
" h += 1.0 / i\n",
|
| 930 |
+
" return h\n",
|
| 931 |
+
"\n",
|
| 932 |
+
"def digammaFun(n):\n",
|
| 933 |
+
" \"\"\"\n",
|
| 934 |
+
" figamma function\n",
|
| 935 |
+
" Parameters\n",
|
| 936 |
+
" n : number\n",
|
| 937 |
+
" \"\"\"\n",
|
| 938 |
+
" #Euler Mascheroni constant\n",
|
| 939 |
+
" ec = 0.577216\n",
|
| 940 |
+
" return harmonicNum(n - 1) - ec\n",
|
| 941 |
+
"\n",
|
| 942 |
+
"def getDataPartitions(tdata, types, columns = None):\n",
|
| 943 |
+
" \"\"\"\n",
|
| 944 |
+
" partitions data with the given columns and random split point defined with predicates\n",
|
| 945 |
+
" Parameters\n",
|
| 946 |
+
" tdata : 2D array\n",
|
| 947 |
+
" types : data typers\n",
|
| 948 |
+
" columns : column indexes\n",
|
| 949 |
+
" \"\"\"\n",
|
| 950 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\n",
|
| 951 |
+
" if columns is None:\n",
|
| 952 |
+
" ncol = len(data[0])\n",
|
| 953 |
+
" columns = list(range(ncol))\n",
|
| 954 |
+
" ncol = len(columns)\n",
|
| 955 |
+
" #print(columns)\n",
|
| 956 |
+
"\n",
|
| 957 |
+
" # partition predicates\n",
|
| 958 |
+
" partitions = None\n",
|
| 959 |
+
" for c in columns:\n",
|
| 960 |
+
" #print(c)\n",
|
| 961 |
+
" dtype = dtypes[c]\n",
|
| 962 |
+
" pred = list()\n",
|
| 963 |
+
" if dtype == \"int\" or dtype == \"float\":\n",
|
| 964 |
+
" (vmin, vmax) = getColMinMax(tdata, c)\n",
|
| 965 |
+
" r = vmax - vmin\n",
|
| 966 |
+
" rmin = vmin + .2 * r\n",
|
| 967 |
+
" rmax = vmax - .2 * r\n",
|
| 968 |
+
" sp = randomFloat(rmin, rmax)\n",
|
| 969 |
+
" if dtype == \"int\":\n",
|
| 970 |
+
" sp = int(sp)\n",
|
| 971 |
+
" else:\n",
|
| 972 |
+
" sp = \"{:.3f}\".format(sp)\n",
|
| 973 |
+
" sp = float(sp)\n",
|
| 974 |
+
" pred.append([c, \"LT\", sp])\n",
|
| 975 |
+
" pred.append([c, \"GE\", sp])\n",
|
| 976 |
+
" elif dtype == \"cat\":\n",
|
| 977 |
+
" cv = cvalues[c]\n",
|
| 978 |
+
" card = len(cv) \n",
|
| 979 |
+
" if card < 3:\n",
|
| 980 |
+
" num = 1\n",
|
| 981 |
+
" else:\n",
|
| 982 |
+
" num = randomInt(1, card - 1)\n",
|
| 983 |
+
" sp = selectRandomSubListFromList(cv, num)\n",
|
| 984 |
+
" sp = \" \".join(sp)\n",
|
| 985 |
+
" pred.append([c, \"IN\", sp])\n",
|
| 986 |
+
" pred.append([c, \"NOTIN\", sp])\n",
|
| 987 |
+
"\n",
|
| 988 |
+
" #print(pred)\n",
|
| 989 |
+
" if partitions is None:\n",
|
| 990 |
+
" partitions = pred.copy()\n",
|
| 991 |
+
" #print(\"initial\")\n",
|
| 992 |
+
" #print(partitions)\n",
|
| 993 |
+
" else:\n",
|
| 994 |
+
" #print(\"extension\")\n",
|
| 995 |
+
" tparts = list()\n",
|
| 996 |
+
" for p in partitions:\n",
|
| 997 |
+
" #print(p)\n",
|
| 998 |
+
" l1 = p.copy()\n",
|
| 999 |
+
" l1.extend(pred[0])\n",
|
| 1000 |
+
" l2 = p.copy()\n",
|
| 1001 |
+
" l2.extend(pred[1])\n",
|
| 1002 |
+
" #print(\"after extension\")\n",
|
| 1003 |
+
" #print(l1)\n",
|
| 1004 |
+
" #print(l2)\n",
|
| 1005 |
+
" tparts.append(l1)\n",
|
| 1006 |
+
" tparts.append(l2)\n",
|
| 1007 |
+
" partitions = tparts\t\n",
|
| 1008 |
+
" #print(\"extending\")\n",
|
| 1009 |
+
" #print(partitions)\n",
|
| 1010 |
+
"\n",
|
| 1011 |
+
" #for p in partitions:\n",
|
| 1012 |
+
" #print(p)\t\n",
|
| 1013 |
+
" return partitions\t\t\t\n",
|
| 1014 |
+
"\n",
|
| 1015 |
+
"def genAlmostUniformDistr(size, nswap=50):\n",
|
| 1016 |
+
" \"\"\"\n",
|
| 1017 |
+
" generate probability distribution\n",
|
| 1018 |
+
"\n",
|
| 1019 |
+
" Parameters\n",
|
| 1020 |
+
" size : distr size\n",
|
| 1021 |
+
" nswap : no of mass swaps\n",
|
| 1022 |
+
" \"\"\"\n",
|
| 1023 |
+
" un = 1.0 / size\n",
|
| 1024 |
+
" distr = [un] * size\n",
|
| 1025 |
+
" distr = mutDistr(distr, 0.1 * un, nswap)\n",
|
| 1026 |
+
" return distr\n",
|
| 1027 |
+
"\n",
|
| 1028 |
+
"def mutDistr(distr, shift, nswap=50):\n",
|
| 1029 |
+
" \"\"\"\n",
|
| 1030 |
+
" mutates a probability distribution\n",
|
| 1031 |
+
"\n",
|
| 1032 |
+
" Parameters\n",
|
| 1033 |
+
" distr distribution\n",
|
| 1034 |
+
" shift : amount of shift for swap\n",
|
| 1035 |
+
" nswap : no of mass swaps\n",
|
| 1036 |
+
" \"\"\"\n",
|
| 1037 |
+
" size = len(distr)\n",
|
| 1038 |
+
" for _ in range(nswap):\n",
|
| 1039 |
+
" fi = randomInt(0, size -1)\n",
|
| 1040 |
+
" si = randomInt(0, size -1)\n",
|
| 1041 |
+
" while fi == si:\n",
|
| 1042 |
+
" fi = randomInt(0, size -1)\n",
|
| 1043 |
+
" si = randomInt(0, size -1)\n",
|
| 1044 |
+
"\n",
|
| 1045 |
+
" shift = randomFloat(0, shift)\n",
|
| 1046 |
+
" t = distr[fi]\n",
|
| 1047 |
+
" distr[fi] -= shift\n",
|
| 1048 |
+
" if (distr[fi] < 0):\n",
|
| 1049 |
+
" distr[fi] = 0.0\n",
|
| 1050 |
+
" shift = t\n",
|
| 1051 |
+
" distr[si] += shift\n",
|
| 1052 |
+
" return distr\n",
|
| 1053 |
+
"\n",
|
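"#note (illustrative): genAlmostUniformDistr(4) starts from [0.25, 0.25, 0.25, 0.25] and each of\n",
"#the nswap swaps in mutDistr moves a random amount of mass between two bins, so the result\n",
"#still sums to 1\n",
"\n",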
| 1054 |
+
"def generateBinDistribution(size, ntrue):\n",
|
| 1055 |
+
" \"\"\"\n",
|
| 1056 |
+
" generate binary array with some elements set to 1\n",
|
| 1057 |
+
"\n",
|
| 1058 |
+
" Parameters\n",
|
| 1059 |
+
" size : distr size\n",
|
| 1060 |
+
" ntrue : no of true values\n",
|
| 1061 |
+
" \"\"\"\n",
|
| 1062 |
+
" distr = [0] * size\n",
|
| 1063 |
+
" idxs = selectRandomSubListFromList(list(range(size)), ntrue)\n",
|
| 1064 |
+
" for i in idxs:\n",
|
| 1065 |
+
" distr[i] = 1\n",
|
| 1066 |
+
" return distr\n",
|
| 1067 |
+
"\n",
|
| 1068 |
+
"def mutBinaryDistr(distr, nmut):\n",
|
| 1069 |
+
" \"\"\"\n",
|
| 1070 |
+
" mutate binary distribution\n",
|
| 1071 |
+
"\n",
|
| 1072 |
+
" Parameters\n",
|
| 1073 |
+
" distr : distr\n",
|
| 1074 |
+
" nmut : no of mutations\n",
|
| 1075 |
+
" \"\"\"\n",
|
| 1076 |
+
" idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)\n",
|
| 1077 |
+
" for i in idxs:\n",
|
| 1078 |
+
" distr[i] = distr[i] ^ 1\n",
|
| 1079 |
+
"\n",
|
| 1080 |
+
"\n",
|
| 1081 |
+
"def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=\",\"):\n",
|
| 1082 |
+
" \"\"\"\n",
|
| 1083 |
+
" file record generator that superimposes given data in the specified segment of a column\n",
|
| 1084 |
+
" Parameters\n",
|
| 1085 |
+
" filePath ; file path\n",
|
| 1086 |
+
" column : column index \n",
|
| 1087 |
+
" offset : offset into column values\n",
|
| 1088 |
+
" seqLen : length of subseq\n",
|
| 1089 |
+
" modifier : data to be superimposed either list or a sampler object\n",
|
| 1090 |
+
" precision : floating point precision\n",
|
| 1091 |
+
" delim : delemeter\n",
|
| 1092 |
+
" \"\"\"\n",
|
| 1093 |
+
" beg = offset\n",
|
| 1094 |
+
" end = beg + seqLen\n",
|
| 1095 |
+
" isList = type(modifier) == list\n",
|
| 1096 |
+
" i = 0\n",
|
| 1097 |
+
" for rec in fileRecGen(filePath, delim):\n",
|
| 1098 |
+
" if i >= beg and i < end:\n",
|
| 1099 |
+
" va = float(rec[column])\n",
|
| 1100 |
+
" if isList:\n",
|
| 1101 |
+
" va += modifier[i - beg] \n",
|
| 1102 |
+
" else:\n",
|
| 1103 |
+
" va += modifier.sample()\n",
|
| 1104 |
+
" rec[column] = formatFloat(precision, va)\n",
|
| 1105 |
+
" yield delim.join(rec)\n",
|
| 1106 |
+
" i += 1\n",
|
| 1107 |
+
"\n",
|
| 1108 |
+
"class ShiftedDataGenerator:\n",
|
| 1109 |
+
" \"\"\"\n",
|
| 1110 |
+
" transforms data for distribution shift\n",
|
| 1111 |
+
" \"\"\"\n",
|
| 1112 |
+
" def __init__(self, types, tdata, addFact, multFact):\n",
|
| 1113 |
+
" \"\"\"\n",
|
| 1114 |
+
" initializer\n",
|
| 1115 |
+
"\n",
|
| 1116 |
+
" Parameters\n",
|
| 1117 |
+
" types data types\n",
|
| 1118 |
+
" tdata : 2D array\n",
|
| 1119 |
+
" addFact ; factor for data shift\n",
|
| 1120 |
+
" multFact ; factor for data scaling\n",
|
| 1121 |
+
" \"\"\"\n",
|
| 1122 |
+
" (self.dtypes, self.cvalues) = extractTypesFromString(types)\n",
|
| 1123 |
+
"\n",
|
| 1124 |
+
" self.limits = dict()\n",
|
| 1125 |
+
" for k,v in self.dtypes.items():\n",
|
| 1126 |
+
" if v == \"int\" or v == \"false\":\n",
|
| 1127 |
+
" (vmax, vmin) = getColMinMax(tdata, k)\n",
|
| 1128 |
+
" self.limits[k] = vmax - vmin\n",
|
| 1129 |
+
" self.addMin = - addFact / 2\n",
|
| 1130 |
+
" self.addMax = addFact / 2\n",
|
| 1131 |
+
" self.multMin = 1.0 - multFact / 2\n",
|
| 1132 |
+
" self.multMax = 1.0 + multFact / 2\n",
|
| 1133 |
+
"\n",
|
| 1134 |
+
"\n",
|
| 1135 |
+
"\n",
|
| 1136 |
+
"\n",
|
| 1137 |
+
" def transform(self, tdata):\n",
|
| 1138 |
+
" \"\"\"\n",
|
| 1139 |
+
" linear transforms data to create distribution shift with random shift and scale\n",
|
| 1140 |
+
" Parameters\n",
|
| 1141 |
+
" types : data types\n",
|
| 1142 |
+
" \"\"\"\n",
|
| 1143 |
+
" transforms = dict()\n",
|
| 1144 |
+
" for k,v in self.dtypes.items():\n",
|
| 1145 |
+
" if v == \"int\" or v == \"false\":\t\t\t\t\n",
|
| 1146 |
+
" shift = randomFloat(self.addMin, self.addMax) * self.limits[k] \n",
|
| 1147 |
+
" scale = randomFloat(self.multMin, self.multMax)\n",
|
| 1148 |
+
" trns = (shift, scale)\n",
|
| 1149 |
+
" transforms[k] = trns\n",
|
| 1150 |
+
" elif v == \"cat\":\n",
|
| 1151 |
+
" transforms[k] = isEventSampled(50)\n",
|
| 1152 |
+
"\n",
|
| 1153 |
+
" ttdata = list()\n",
|
| 1154 |
+
" for rec in tdata:\n",
|
| 1155 |
+
" nrec = rec.copy()\n",
|
| 1156 |
+
" for c in range(len(rec)):\n",
|
| 1157 |
+
" if c in self.dtypes:\n",
|
| 1158 |
+
" dtype = self.dtypes[c]\n",
|
| 1159 |
+
" if dtype == \"int\" or dtype == \"float\":\n",
|
| 1160 |
+
" (shift, scale) = transforms[c]\n",
|
| 1161 |
+
" nval = shift + rec[c] * scale\n",
|
| 1162 |
+
" if dtype == \"int\":\n",
|
| 1163 |
+
" nrec[c] = int(nval)\n",
|
| 1164 |
+
" else:\n",
|
| 1165 |
+
" nrec[c] = nval\n",
|
| 1166 |
+
" elif dtype == \"cat\":\n",
|
| 1167 |
+
" cv = self.cvalues[c]\n",
|
| 1168 |
+
" if transforms[c]:\n",
|
| 1169 |
+
" nval = selectOtherRandomFromList(cv, rec[c])\n",
|
| 1170 |
+
" nrec[c] = nval\n",
|
| 1171 |
+
"\n",
|
| 1172 |
+
" ttdata.append(nrec)\n",
|
| 1173 |
+
"\n",
|
| 1174 |
+
" return ttdata\n",
|
| 1175 |
+
"\n",
|
| 1176 |
+
" def transformSpecified(self, tdata, sshift, scale):\n",
|
| 1177 |
+
" \"\"\"\n",
|
| 1178 |
+
" linear transforms data to create distribution shift shift specified shift and scale\n",
|
| 1179 |
+
" Parameters\n",
|
| 1180 |
+
" types : data types\n",
|
| 1181 |
+
" sshift : shift factor\n",
|
| 1182 |
+
" scale : scale factor\n",
|
| 1183 |
+
" \"\"\"\n",
|
| 1184 |
+
" transforms = dict()\n",
|
| 1185 |
+
" for k,v in self.dtypes.items():\n",
|
| 1186 |
+
" if v == \"int\" or v == \"false\":\t\t\t\t\n",
|
| 1187 |
+
" shift = sshift * self.limits[k] \n",
|
| 1188 |
+
" trns = (shift, scale)\n",
|
| 1189 |
+
" transforms[k] = trns\n",
|
| 1190 |
+
" elif v == \"cat\":\n",
|
| 1191 |
+
" transforms[k] = isEventSampled(50)\n",
|
| 1192 |
+
"\n",
|
| 1193 |
+
" ttdata = self.__scaleShift(tdata, transforms)\n",
|
| 1194 |
+
" return ttdata\n",
|
| 1195 |
+
"\n",
|
| 1196 |
+
" def __scaleShift(self, tdata, transforms):\n",
|
| 1197 |
+
" \"\"\"\n",
|
| 1198 |
+
" shifts and scales tabular data\n",
|
| 1199 |
+
"\n",
|
| 1200 |
+
" Parameters\n",
|
| 1201 |
+
" tdata : 2D array\n",
|
| 1202 |
+
" transforms : transforms to apply\n",
|
| 1203 |
+
" \"\"\"\n",
|
| 1204 |
+
" ttdata = list()\n",
|
| 1205 |
+
" for rec in tdata:\n",
|
| 1206 |
+
" nrec = rec.copy()\n",
|
| 1207 |
+
" for c in range(len(rec)):\n",
|
| 1208 |
+
" if c in self.dtypes:\n",
|
| 1209 |
+
" dtype = self.dtypes[c]\n",
|
| 1210 |
+
" if dtype == \"int\" or dtype == \"float\":\n",
|
| 1211 |
+
" (shift, scale) = transforms[c]\n",
|
| 1212 |
+
" nval = shift + rec[c] * scale\n",
|
| 1213 |
+
" if dtype == \"int\":\n",
|
| 1214 |
+
" nrec[c] = int(nval)\n",
|
| 1215 |
+
" else:\n",
|
| 1216 |
+
" nrec[c] = nval\n",
|
| 1217 |
+
" elif dtype == \"cat\":\n",
|
| 1218 |
+
" cv = self.cvalues[c]\n",
|
| 1219 |
+
" if transforms[c]:\n",
|
| 1220 |
+
" #nval = selectOtherRandomFromList(cv, rec[c])\n",
|
| 1221 |
+
" #nrec[c] = nval\n",
|
| 1222 |
+
" pass\n",
|
| 1223 |
+
"\n",
|
| 1224 |
+
" ttdata.append(nrec)\n",
|
| 1225 |
+
" return ttdata\n",
|
| 1226 |
+
"\n",
|
| 1227 |
+
"class RollingStat(object):\n",
|
| 1228 |
+
" \"\"\"\n",
|
| 1229 |
+
" stats for rolling windowt\n",
|
| 1230 |
+
" \"\"\"\n",
|
| 1231 |
+
" def __init__(self, wsize):\n",
|
| 1232 |
+
" \"\"\"\n",
|
| 1233 |
+
" initializer\n",
|
| 1234 |
+
"\n",
|
| 1235 |
+
" Parameters\n",
|
| 1236 |
+
" wsize : window size\n",
|
| 1237 |
+
" \"\"\"\n",
|
| 1238 |
+
" self.window = list()\n",
|
| 1239 |
+
" self.wsize = wsize\n",
|
| 1240 |
+
" self.mean = None\n",
|
| 1241 |
+
" self.sd = None\n",
|
| 1242 |
+
"\n",
|
| 1243 |
+
" def add(self, value):\n",
|
| 1244 |
+
" \"\"\"\n",
|
| 1245 |
+
" add a value\n",
|
| 1246 |
+
"\n",
|
| 1247 |
+
" Parameters\n",
|
| 1248 |
+
" value : value to add\n",
|
| 1249 |
+
" \"\"\"\n",
|
| 1250 |
+
" self.window.append(value)\n",
|
| 1251 |
+
" if len(self.window) > self.wsize:\n",
|
| 1252 |
+
" self.window = self.window[1:]\n",
|
| 1253 |
+
"\n",
|
| 1254 |
+
" def getStat(self):\n",
|
| 1255 |
+
" \"\"\"\n",
|
| 1256 |
+
" get rolling window mean and std deviation\n",
|
| 1257 |
+
" \"\"\"\n",
|
| 1258 |
+
" assertGreater(len(self.window), 0, \"window is empty\")\n",
|
| 1259 |
+
" if len(self.window) == 1:\n",
|
| 1260 |
+
" self.mean = self.window[0]\n",
|
| 1261 |
+
" self.sd = 0\n",
|
| 1262 |
+
" else:\n",
|
| 1263 |
+
" self.mean = statistics.mean(self.window)\n",
|
| 1264 |
+
" self.sd = statistics.stdev(self.window, xbar=self.mean)\n",
|
| 1265 |
+
" re = (self.mean, self.sd)\n",
|
| 1266 |
+
" return re\n",
|
| 1267 |
+
"\n",
|
| 1268 |
+
" def getSize(self):\n",
|
| 1269 |
+
" \"\"\"\n",
|
| 1270 |
+
" return window size\n",
|
| 1271 |
+
" \"\"\"\n",
|
| 1272 |
+
" return len(self.window)\n"
|
| 1273 |
+
]
|
| 1274 |
+
}
|
| 1275 |
+
],
|
| 1276 |
+
"metadata": {
|
| 1277 |
+
"kernelspec": {
|
| 1278 |
+
"display_name": "Python 3 (ipykernel)",
|
| 1279 |
+
"language": "python",
|
| 1280 |
+
"name": "python3"
|
| 1281 |
+
},
|
| 1282 |
+
"language_info": {
|
| 1283 |
+
"codemirror_mode": {
|
| 1284 |
+
"name": "ipython",
|
| 1285 |
+
"version": 3
|
| 1286 |
+
},
|
| 1287 |
+
"file_extension": ".py",
|
| 1288 |
+
"mimetype": "text/x-python",
|
| 1289 |
+
"name": "python",
|
| 1290 |
+
"nbconvert_exporter": "python",
|
| 1291 |
+
"pygments_lexer": "ipython3",
|
| 1292 |
+
"version": "3.9.12"
|
| 1293 |
+
}
|
| 1294 |
+
},
|
| 1295 |
+
"nbformat": 4,
|
| 1296 |
+
"nbformat_minor": 5
|
| 1297 |
+
}
|
lib/sampler.ipynb
ADDED
|
@@ -0,0 +1,1366 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "c19a2efe",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import sys\n",
|
| 11 |
+
"import random \n",
|
| 12 |
+
"import time\n",
|
| 13 |
+
"import math\n",
|
| 14 |
+
"import random\n",
|
| 15 |
+
"import numpy as np\n",
|
| 16 |
+
"from scipy import stats\n",
|
| 17 |
+
"from random import randint\n",
|
| 18 |
+
"from util import *\n",
|
| 19 |
+
"from stats import Histogram\n",
|
| 20 |
+
"\n",
|
| 21 |
+
"def randomFloat(low, high):\n",
|
| 22 |
+
" \"\"\"\n",
|
| 23 |
+
" sample float within range\n",
|
| 24 |
+
" Parameters\n",
|
| 25 |
+
" low : low valuee\n",
|
| 26 |
+
" high : high valuee\n",
|
| 27 |
+
" \"\"\"\n",
|
| 28 |
+
" return random.random() * (high-low) + low\n",
|
| 29 |
+
"\n",
|
| 30 |
+
"def randomInt(minv, maxv):\n",
|
| 31 |
+
" \"\"\"\n",
|
| 32 |
+
" sample int within range\n",
|
| 33 |
+
" Parameters\n",
|
| 34 |
+
" minv : low valuee\n",
|
| 35 |
+
" maxv : high valuee\n",
|
| 36 |
+
" \"\"\"\n",
|
| 37 |
+
" return randint(minv, maxv)\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"def randIndex(lData):\n",
|
| 40 |
+
" \"\"\"\n",
|
| 41 |
+
" random index of a list\n",
|
| 42 |
+
" Parameters\n",
|
| 43 |
+
" lData : list data\n",
|
| 44 |
+
" \"\"\"\n",
|
| 45 |
+
" return randint(0, len(lData)-1)\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"def randomUniformSampled(low, high):\n",
|
| 48 |
+
" \"\"\"\n",
|
| 49 |
+
" sample float within range\n",
|
| 50 |
+
"\n",
|
| 51 |
+
" Parameters\n",
|
| 52 |
+
" low : low value\n",
|
| 53 |
+
" high : high value\n",
|
| 54 |
+
" \"\"\"\n",
|
| 55 |
+
" return np.random.uniform(low, high)\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"def randomUniformSampledList(low, high, size):\n",
|
| 58 |
+
" \"\"\"\n",
|
| 59 |
+
" sample floats within range to create list\n",
|
| 60 |
+
" Parameters\n",
|
| 61 |
+
" low : low value\n",
|
| 62 |
+
" high : high value\n",
|
| 63 |
+
" size ; size of list to be returned\n",
|
| 64 |
+
" \"\"\"\n",
|
| 65 |
+
" return np.random.uniform(low, high, size)\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"def randomNormSampled(mean, sd):\n",
|
| 68 |
+
" \"\"\"\n",
|
| 69 |
+
" sample float from normal\n",
|
| 70 |
+
" Parameters\n",
|
| 71 |
+
" mean : mean\n",
|
| 72 |
+
" sd : std deviation\n",
|
| 73 |
+
" \"\"\"\n",
|
| 74 |
+
" return np.random.normal(mean, sd)\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"def randomNormSampledList(mean, sd, size):\n",
|
| 77 |
+
" \"\"\"\n",
|
| 78 |
+
" sample float list from normal \n",
|
| 79 |
+
" Parameters\n",
|
| 80 |
+
" mean : mean\n",
|
| 81 |
+
" sd : std deviation\n",
|
| 82 |
+
" size : size of list to be returned\n",
|
| 83 |
+
" \"\"\"\n",
|
| 84 |
+
" return np.random.normal(mean, sd, size)\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"def randomSampledList(sampler, size):\n",
|
| 87 |
+
" \"\"\"\n",
|
| 88 |
+
" sample list from given sampler \n",
|
| 89 |
+
" Parameters\n",
|
| 90 |
+
" sampler : sampler object\n",
|
| 91 |
+
" size : size of list to be returned\n",
|
| 92 |
+
" \"\"\"\n",
|
| 93 |
+
" return list(map(lambda i : sampler.sample(), range(size)))\n",
|
| 94 |
+
"\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"def minLimit(val, minv):\n",
|
| 97 |
+
" \"\"\"\n",
|
| 98 |
+
" min limit\n",
|
| 99 |
+
"\n",
|
| 100 |
+
" Parameters\n",
|
| 101 |
+
" val : value\n",
|
| 102 |
+
" minv : min limit\n",
|
| 103 |
+
" \"\"\"\n",
|
| 104 |
+
" if (val < minv):\n",
|
| 105 |
+
" val = minv\n",
|
| 106 |
+
" return val\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"\n",
|
| 109 |
+
"def rangeLimit(val, minv, maxv):\n",
|
| 110 |
+
" \"\"\"\n",
|
| 111 |
+
" range limit\n",
|
| 112 |
+
" Parameters\n",
|
| 113 |
+
" val : value\n",
|
| 114 |
+
" minv : min limit\n",
|
| 115 |
+
" maxv : max limit\n",
|
| 116 |
+
" \"\"\"\n",
|
| 117 |
+
" if (val < minv):\n",
|
| 118 |
+
" val = minv\n",
|
| 119 |
+
" elif (val > maxv):\n",
|
| 120 |
+
" val = maxv\n",
|
| 121 |
+
" return val\n",
|
| 122 |
+
"\n",
|
| 123 |
+
"\n",
|
| 124 |
+
"def sampleUniform(minv, maxv):\n",
|
| 125 |
+
" \"\"\"\n",
|
| 126 |
+
" sample int within range\n",
|
| 127 |
+
" Parameters\n",
|
| 128 |
+
" minv ; int min limit\n",
|
| 129 |
+
" maxv : int max limit\n",
|
| 130 |
+
" \"\"\"\n",
|
| 131 |
+
" return randint(minv, maxv)\n",
|
| 132 |
+
"\n",
|
| 133 |
+
"\n",
|
| 134 |
+
"def sampleFromBase(value, dev):\n",
|
| 135 |
+
" \"\"\"\n",
|
| 136 |
+
" sample int wrt base\n",
|
| 137 |
+
" Parameters\n",
|
| 138 |
+
" value : base value\n",
|
| 139 |
+
" dev : deviation\n",
|
| 140 |
+
" \"\"\"\n",
|
| 141 |
+
" return randint(value - dev, value + dev)\n",
|
| 142 |
+
"\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"def sampleFloatFromBase(value, dev):\n",
|
| 145 |
+
" \"\"\"\n",
|
| 146 |
+
" sample float wrt base\n",
|
| 147 |
+
" Parameters\n",
|
| 148 |
+
" value : base value\n",
|
| 149 |
+
" dev : deviation\n",
|
| 150 |
+
" \"\"\"\n",
|
| 151 |
+
" return randomFloat(value - dev, value + dev)\n",
|
| 152 |
+
"\n",
|
| 153 |
+
"\n",
|
| 154 |
+
"def distrUniformWithRanndom(total, numItems, noiseLevel):\n",
|
| 155 |
+
" \"\"\"\n",
|
| 156 |
+
" uniformly distribute with some randomness and preserves total\n",
|
| 157 |
+
" Parameters\n",
|
| 158 |
+
" total : total count\n",
|
| 159 |
+
" numItems : no of bins\n",
|
| 160 |
+
" noiseLevel : noise level fraction\n",
|
| 161 |
+
" \"\"\"\n",
|
| 162 |
+
" perItem = total / numItems\n",
|
| 163 |
+
" var = perItem * noiseLevel\n",
|
| 164 |
+
" items = []\n",
|
| 165 |
+
" for i in range(numItems):\n",
|
| 166 |
+
" item = perItem + randomFloat(-var, var)\n",
|
| 167 |
+
" items.append(item)\t\n",
|
| 168 |
+
"\n",
|
| 169 |
+
" #adjust last item\n",
|
| 170 |
+
" sm = sum(items[:-1])\n",
|
| 171 |
+
" items[-1] = total - sm\n",
|
| 172 |
+
" return items\n",
|
| 173 |
+
"\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"def isEventSampled(threshold, maxv=100):\n",
|
| 176 |
+
" \"\"\"\n",
|
| 177 |
+
" sample event which occurs if sampled below threshold\n",
|
| 178 |
+
" Parameters\n",
|
| 179 |
+
" threshold : threshold for sampling\n",
|
| 180 |
+
" maxv : maximum values\n",
|
| 181 |
+
" \"\"\"\n",
|
| 182 |
+
" return randint(0, maxv) < threshold\n",
|
| 183 |
+
"\n",
|
| 184 |
+
"\n",
|
| 185 |
+
"def sampleBinaryEvents(events, probPercent):\n",
|
| 186 |
+
" \"\"\"\n",
|
| 187 |
+
" sample binary events\n",
|
| 188 |
+
" Parameters\n",
|
| 189 |
+
" events : two events\n",
|
| 190 |
+
" probPercent : probability as percentage\n",
|
| 191 |
+
" \"\"\"\n",
|
| 192 |
+
" if (randint(0, 100) < probPercent):\n",
|
| 193 |
+
" event = events[0]\n",
|
| 194 |
+
" else:\n",
|
| 195 |
+
" event = events[1]\n",
|
| 196 |
+
" return event\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"\n",
|
| 199 |
+
"def addNoiseNum(value, sampler):\n",
|
| 200 |
+
" \"\"\"\n",
|
| 201 |
+
" add noise to numeric value\n",
|
| 202 |
+
" Parameters\n",
|
| 203 |
+
" value : base value\n",
|
| 204 |
+
" sampler : sampler for noise\n",
|
| 205 |
+
" \"\"\"\n",
|
| 206 |
+
" return value * (1 + sampler.sample())\n",
|
| 207 |
+
"\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"def addNoiseCat(value, values, noise):\t\n",
|
| 210 |
+
" \"\"\"\n",
|
| 211 |
+
" add noise to categorical value i.e with some probability change value\n",
|
| 212 |
+
" Parameters\n",
|
| 213 |
+
" value : cat value\n",
|
| 214 |
+
" values : cat values\n",
|
| 215 |
+
" noise : noise level fraction\n",
|
| 216 |
+
" \"\"\"\n",
|
| 217 |
+
" newValue = value\n",
|
| 218 |
+
" threshold = int(noise * 100)\n",
|
| 219 |
+
" if (isEventSampled(threshold)):\t\t\n",
|
| 220 |
+
" newValue = selectRandomFromList(values)\n",
|
| 221 |
+
" while newValue == value:\n",
|
| 222 |
+
" newValue = selectRandomFromList(values)\n",
|
| 223 |
+
" return newValue\n",
|
| 224 |
+
"\n",
|
| 225 |
+
"\n",
|
| 226 |
+
"def sampleWithReplace(data, sampSize):\n",
|
| 227 |
+
" \"\"\"\n",
|
| 228 |
+
" sample with replacement\n",
|
| 229 |
+
" Parameters\n",
|
| 230 |
+
" data : array\n",
|
| 231 |
+
" sampSize : sample size\n",
|
| 232 |
+
" \"\"\"\n",
|
| 233 |
+
" sampled = list()\n",
|
| 234 |
+
" le = len(data)\n",
|
| 235 |
+
" if sampSize is None:\n",
|
| 236 |
+
" sampSize = le\n",
|
| 237 |
+
" for i in range(sampSize):\n",
|
| 238 |
+
" j = random.randint(0, le - 1)\n",
|
| 239 |
+
" sampled.append(data[j])\n",
|
| 240 |
+
" return sampled\n",
|
| 241 |
+
"\n",
|
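"#illustrative usage sketch (added comment, not in the original code):\n",
"#sampleWithReplace([3, 7, 1, 9], 4) draws a bootstrap sample of size 4\n",
"#from the list, with replacement\n",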
| 242 |
+
"class CumDistr:\n",
|
| 243 |
+
" \"\"\"\n",
|
| 244 |
+
" cumulative distr\n",
|
| 245 |
+
" \"\"\"\n",
|
| 246 |
+
"\n",
|
| 247 |
+
" def __init__(self, data, numBins = None):\n",
|
| 248 |
+
" \"\"\"\n",
|
| 249 |
+
" initializer\n",
|
| 250 |
+
"\n",
|
| 251 |
+
" Parameters\n",
|
| 252 |
+
" data : array\n",
|
| 253 |
+
" numBins : no of bins\n",
|
| 254 |
+
" \"\"\"\n",
|
| 255 |
+
" if not numBins:\n",
|
| 256 |
+
" numBins = int(len(data) / 5)\n",
|
| 257 |
+
" res = stats.cumfreq(data, numbins=numBins)\n",
|
| 258 |
+
" self.cdistr = res.cumcount / len(data)\n",
|
| 259 |
+
" self.loLim = res.lowerlimit\n",
|
| 260 |
+
" self.upLim = res.lowerlimit + res.binsize * res.cumcount.size\n",
|
| 261 |
+
" self.binWidth = res.binsize\n",
|
| 262 |
+
"\n",
|
| 263 |
+
" def getDistr(self, value):\n",
|
| 264 |
+
" \"\"\"\n",
|
| 265 |
+
" get cumulative distribution\n",
|
| 266 |
+
"\n",
|
| 267 |
+
" Parameters\n",
|
| 268 |
+
" value : value\n",
|
| 269 |
+
" \"\"\"\n",
|
| 270 |
+
" if value <= self.loLim:\n",
|
| 271 |
+
" d = 0.0\n",
|
| 272 |
+
" elif value >= self.upLim:\n",
|
| 273 |
+
" d = 1.0\n",
|
| 274 |
+
" else:\n",
|
| 275 |
+
" bin = int((value - self.loLim) / self.binWidth)\n",
|
| 276 |
+
" d = self.cdistr[bin]\n",
|
| 277 |
+
" return d\n",
|
| 278 |
+
"\n",
|
| 279 |
+
"class BernoulliTrialSampler:\n",
|
| 280 |
+
" \"\"\"\n",
|
| 281 |
+
"    bernoulli trial sampler, returns True or False\n",
|
| 282 |
+
" \"\"\"\n",
|
| 283 |
+
"\n",
|
| 284 |
+
" def __init__(self, pr):\n",
|
| 285 |
+
" \"\"\"\n",
|
| 286 |
+
" initializer\n",
|
| 287 |
+
"\n",
|
| 288 |
+
" Parameters\n",
|
| 289 |
+
" pr : probability\n",
|
| 290 |
+
" \"\"\"\n",
|
| 291 |
+
" self.pr = pr\n",
|
| 292 |
+
"\n",
|
| 293 |
+
" def sample(self):\n",
|
| 294 |
+
" \"\"\"\n",
|
| 295 |
+
" samples value\n",
|
| 296 |
+
" \"\"\"\n",
|
| 297 |
+
" return random.random() < self.pr\n",
|
| 298 |
+
"\n",
|
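"#illustrative usage sketch (added comment, not in the original code):\n",
"#btrial = BernoulliTrialSampler(0.3)\n",
"#flips = [btrial.sample() for _ in range(10)]   #roughly 30 percent True\n",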
| 299 |
+
"class PoissonSampler:\n",
|
| 300 |
+
" \"\"\"\n",
|
| 301 |
+
" poisson sampler returns number of events\n",
|
| 302 |
+
" \"\"\"\n",
|
| 303 |
+
" def __init__(self, rateOccur, maxSamp):\n",
|
| 304 |
+
" \"\"\"\n",
|
| 305 |
+
" initializer\n",
|
| 306 |
+
"\n",
|
| 307 |
+
" Parameters\n",
|
| 308 |
+
"        rateOccur : rate of occurrence\n",
|
| 309 |
+
" maxSamp : max limit on no of samples\n",
|
| 310 |
+
" \"\"\"\n",
|
| 311 |
+
" self.rateOccur = rateOccur\n",
|
| 312 |
+
" self.maxSamp = int(maxSamp)\n",
|
| 313 |
+
" self.pmax = self.calculatePr(rateOccur)\n",
|
| 314 |
+
"\n",
|
| 315 |
+
" def calculatePr(self, numOccur):\n",
|
| 316 |
+
" \"\"\"\n",
|
| 317 |
+
"        calculates probability\n",
|
| 318 |
+
"\n",
|
| 319 |
+
" Parameters\n",
|
| 320 |
+
"        numOccur : no of occurrences\n",
|
| 321 |
+
" \"\"\"\n",
|
| 322 |
+
" p = (self.rateOccur ** numOccur) * math.exp(-self.rateOccur) / math.factorial(numOccur)\n",
|
| 323 |
+
" return p\n",
|
| 324 |
+
"\n",
|
| 325 |
+
" def sample(self):\n",
|
| 326 |
+
" \"\"\"\n",
|
| 327 |
+
" samples value\n",
|
| 328 |
+
" \"\"\"\n",
|
| 329 |
+
" done = False\n",
|
| 330 |
+
" samp = 0\n",
|
| 331 |
+
" while not done:\n",
|
| 332 |
+
" no = randint(0, self.maxSamp)\n",
|
| 333 |
+
" sp = randomFloat(0.0, self.pmax)\n",
|
| 334 |
+
" ap = self.calculatePr(no)\n",
|
| 335 |
+
" if sp < ap:\n",
|
| 336 |
+
" done = True\n",
|
| 337 |
+
" samp = no\n",
|
| 338 |
+
" return samp\n",
|
| 339 |
+
"\n",
|
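"#illustrative usage sketch (added comment, not in the original code):\n",
"#event counts with average rate 4, each draw capped at 20\n",
"#psampler = PoissonSampler(4.0, 20)\n",
"#counts = [psampler.sample() for _ in range(5)]\n",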
| 340 |
+
"class ExponentialSampler:\n",
|
| 341 |
+
" \"\"\"\n",
|
| 342 |
+
" returns interval between events\n",
|
| 343 |
+
" \"\"\"\n",
|
| 344 |
+
" def __init__(self, rateOccur, maxSamp = None):\n",
|
| 345 |
+
" \"\"\"\n",
|
| 346 |
+
" initializer\n",
|
| 347 |
+
"\n",
|
| 348 |
+
" Parameters\n",
|
| 349 |
+
"        rateOccur : rate of occurrence\n",
|
| 350 |
+
" maxSamp : max limit on interval\n",
|
| 351 |
+
" \"\"\"\n",
|
| 352 |
+
" self.interval = 1.0 / rateOccur\n",
|
| 353 |
+
" self.maxSamp = int(maxSamp) if maxSamp is not None else None\n",
|
| 354 |
+
"\n",
|
| 355 |
+
" def sample(self):\n",
|
| 356 |
+
" \"\"\"\n",
|
| 357 |
+
" samples value\n",
|
| 358 |
+
" \"\"\"\n",
|
| 359 |
+
" sampled = np.random.exponential(scale=self.interval)\n",
|
| 360 |
+
" if self.maxSamp is not None:\n",
|
| 361 |
+
" while sampled > self.maxSamp:\n",
|
| 362 |
+
" sampled = np.random.exponential(scale=self.interval)\n",
|
| 363 |
+
" return sampled\n",
|
| 364 |
+
"\n",
|
| 365 |
+
"class UniformNumericSampler:\n",
|
| 366 |
+
" \"\"\"\n",
|
| 367 |
+
" uniform sampler for numerical values\n",
|
| 368 |
+
" \"\"\"\n",
|
| 369 |
+
" def __init__(self, minv, maxv):\n",
|
| 370 |
+
" \"\"\"\n",
|
| 371 |
+
" initializer\n",
|
| 372 |
+
"\n",
|
| 373 |
+
" Parameters\n",
|
| 374 |
+
" minv : min value\n",
|
| 375 |
+
" maxv : max value\n",
|
| 376 |
+
" \"\"\"\n",
|
| 377 |
+
" self.minv = minv\n",
|
| 378 |
+
" self.maxv = maxv\n",
|
| 379 |
+
"\n",
|
| 380 |
+
" def isNumeric(self):\n",
|
| 381 |
+
" \"\"\"\n",
|
| 382 |
+
" returns true\n",
|
| 383 |
+
" \"\"\"\n",
|
| 384 |
+
" return True\n",
|
| 385 |
+
"\n",
|
| 386 |
+
" def sample(self):\n",
|
| 387 |
+
" \"\"\"\n",
|
| 388 |
+
" samples value\n",
|
| 389 |
+
" \"\"\"\n",
|
| 390 |
+
" samp =\tsampleUniform(self.minv, self.maxv) if isinstance(self.minv, int) else randomFloat(self.minv, self.maxv)\n",
|
| 391 |
+
" return samp\t\n",
|
| 392 |
+
"\n",
|
| 393 |
+
"class UniformCategoricalSampler:\n",
|
| 394 |
+
" \"\"\"\n",
|
| 395 |
+
" uniform sampler for categorical values\n",
|
| 396 |
+
" \"\"\"\n",
|
| 397 |
+
" def __init__(self, cvalues):\n",
|
| 398 |
+
" \"\"\"\n",
|
| 399 |
+
" initializer\n",
|
| 400 |
+
"\n",
|
| 401 |
+
" Parameters\n",
|
| 402 |
+
" cvalues : categorical value list\n",
|
| 403 |
+
" \"\"\"\n",
|
| 404 |
+
" self.cvalues = cvalues\n",
|
| 405 |
+
"\n",
|
| 406 |
+
" def isNumeric(self):\n",
|
| 407 |
+
" return False\n",
|
| 408 |
+
"\n",
|
| 409 |
+
" def sample(self):\n",
|
| 410 |
+
" \"\"\"\n",
|
| 411 |
+
" samples value\n",
|
| 412 |
+
" \"\"\"\n",
|
| 413 |
+
" return selectRandomFromList(self.cvalues)\t\n",
|
| 414 |
+
"\n",
|
| 415 |
+
"class NormalSampler:\n",
|
| 416 |
+
" \"\"\"\n",
|
| 417 |
+
" normal sampler\n",
|
| 418 |
+
" \"\"\"\n",
|
| 419 |
+
" def __init__(self, mean, stdDev):\n",
|
| 420 |
+
" \"\"\"\n",
|
| 421 |
+
" initializer\n",
|
| 422 |
+
"\n",
|
| 423 |
+
" Parameters\n",
|
| 424 |
+
" mean : mean\n",
|
| 425 |
+
" stdDev : std deviation\n",
|
| 426 |
+
" \"\"\"\n",
|
| 427 |
+
" self.mean = mean\n",
|
| 428 |
+
" self.stdDev = stdDev\n",
|
| 429 |
+
" self.sampleAsInt = False\n",
|
| 430 |
+
"\n",
|
| 431 |
+
" def isNumeric(self):\n",
|
| 432 |
+
" return True\n",
|
| 433 |
+
"\n",
|
| 434 |
+
" def sampleAsIntValue(self):\n",
|
| 435 |
+
" \"\"\"\n",
|
| 436 |
+
" set True to sample as int\n",
|
| 437 |
+
" \"\"\"\n",
|
| 438 |
+
" self.sampleAsInt = True\n",
|
| 439 |
+
"\n",
|
| 440 |
+
" def sample(self):\n",
|
| 441 |
+
" \"\"\"\n",
|
| 442 |
+
" samples value\n",
|
| 443 |
+
" \"\"\"\n",
|
| 444 |
+
" samp = np.random.normal(self.mean, self.stdDev)\n",
|
| 445 |
+
" if self.sampleAsInt:\n",
|
| 446 |
+
" samp = int(samp)\n",
|
| 447 |
+
" return samp\n",
|
| 448 |
+
"\n",
|
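"#illustrative usage sketch (added comment, not in the original code):\n",
"#nsampler = NormalSampler(100.0, 15.0)\n",
"#nsampler.sampleAsIntValue()   #optional, to get int samples\n",
"#value = nsampler.sample()\n",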
| 449 |
+
"class LogNormalSampler:\n",
|
| 450 |
+
" \"\"\"\n",
|
| 451 |
+
" log normal sampler\n",
|
| 452 |
+
" \"\"\"\n",
|
| 453 |
+
" def __init__(self, mean, stdDev):\n",
|
| 454 |
+
" \"\"\"\n",
|
| 455 |
+
" initializer\n",
|
| 456 |
+
"\n",
|
| 457 |
+
" Parameters\n",
|
| 458 |
+
" mean : mean\n",
|
| 459 |
+
" stdDev : std deviation\n",
|
| 460 |
+
" \"\"\"\n",
|
| 461 |
+
" self.mean = mean\n",
|
| 462 |
+
" self.stdDev = stdDev\n",
|
| 463 |
+
"\n",
|
| 464 |
+
" def isNumeric(self):\n",
|
| 465 |
+
" return True\n",
|
| 466 |
+
"\n",
|
| 467 |
+
" def sample(self):\n",
|
| 468 |
+
" \"\"\"\n",
|
| 469 |
+
" samples value\n",
|
| 470 |
+
" \"\"\"\n",
|
| 471 |
+
" return np.random.lognormal(self.mean, self.stdDev)\n",
|
| 472 |
+
"\n",
|
| 473 |
+
"class NormalSamplerWithTrendCycle:\n",
|
| 474 |
+
" \"\"\"\n",
|
| 475 |
+
" normal sampler with cycle and trend\n",
|
| 476 |
+
" \"\"\"\n",
|
| 477 |
+
" def __init__(self, mean, stdDev, dmean, cycle, step=1):\n",
|
| 478 |
+
" \"\"\"\n",
|
| 479 |
+
" initializer\n",
|
| 480 |
+
"\n",
|
| 481 |
+
" Parameters\n",
|
| 482 |
+
" mean : mean\n",
|
| 483 |
+
" stdDev : std deviation\n",
|
| 484 |
+
" dmean : trend delta\n",
|
| 485 |
+
" cycle : cycle values wrt base mean\n",
|
| 486 |
+
" step : adjustment step for cycle and trend\n",
|
| 487 |
+
" \"\"\"\n",
|
| 488 |
+
" self.mean = mean\n",
|
| 489 |
+
" self.cmean = mean\n",
|
| 490 |
+
" self.stdDev = stdDev\n",
|
| 491 |
+
" self.dmean = dmean\n",
|
| 492 |
+
" self.cycle = cycle\n",
|
| 493 |
+
" self.clen = len(cycle) if cycle is not None else 0\n",
|
| 494 |
+
" self.step = step\n",
|
| 495 |
+
" self.count = 0\n",
|
| 496 |
+
"\n",
|
| 497 |
+
" def isNumeric(self):\n",
|
| 498 |
+
" return True\n",
|
| 499 |
+
"\n",
|
| 500 |
+
" def sample(self):\n",
|
| 501 |
+
" \"\"\"\n",
|
| 502 |
+
" samples value\n",
|
| 503 |
+
" \"\"\"\n",
|
| 504 |
+
" s = np.random.normal(self.cmean, self.stdDev)\n",
|
| 505 |
+
" self.count += 1\n",
|
| 506 |
+
" if self.count % self.step == 0:\n",
|
| 507 |
+
" cy = 0\n",
|
| 508 |
+
" if self.clen > 1:\n",
|
| 509 |
+
" coff = self.count % self.clen\n",
|
| 510 |
+
" cy = self.cycle[coff]\n",
|
| 511 |
+
" tr = self.count * self.dmean\n",
|
| 512 |
+
" self.cmean = self.mean + tr + cy\n",
|
| 513 |
+
" return s\n",
|
| 514 |
+
"\n",
|
| 515 |
+
"\n",
|
| 516 |
+
"class ParetoSampler:\n",
|
| 517 |
+
" \"\"\"\n",
|
| 518 |
+
" pareto sampler\n",
|
| 519 |
+
" \"\"\"\n",
|
| 520 |
+
" def __init__(self, mode, shape):\n",
|
| 521 |
+
" \"\"\"\n",
|
| 522 |
+
" initializer\n",
|
| 523 |
+
"\n",
|
| 524 |
+
" Parameters\n",
|
| 525 |
+
" mode : mode\n",
|
| 526 |
+
" shape : shape\n",
|
| 527 |
+
" \"\"\"\n",
|
| 528 |
+
" self.mode = mode\n",
|
| 529 |
+
" self.shape = shape\n",
|
| 530 |
+
"\n",
|
| 531 |
+
" def isNumeric(self):\n",
|
| 532 |
+
" return True\n",
|
| 533 |
+
"\n",
|
| 534 |
+
" def sample(self):\n",
|
| 535 |
+
" \"\"\"\n",
|
| 536 |
+
" samples value\n",
|
| 537 |
+
" \"\"\"\n",
|
| 538 |
+
" return (np.random.pareto(self.shape) + 1) * self.mode\n",
|
| 539 |
+
"\n",
|
| 540 |
+
"class GammaSampler:\n",
|
| 541 |
+
" \"\"\"\n",
|
| 542 |
+
"    gamma sampler\n",
|
| 543 |
+
" \"\"\"\n",
|
| 544 |
+
" def __init__(self, shape, scale):\n",
|
| 545 |
+
" \"\"\"\n",
|
| 546 |
+
" initializer\n",
|
| 547 |
+
"\n",
|
| 548 |
+
" Parameters\n",
|
| 549 |
+
" shape : shape\n",
|
| 550 |
+
" scale : scale\n",
|
| 551 |
+
" \"\"\"\n",
|
| 552 |
+
" self.shape = shape\n",
|
| 553 |
+
" self.scale = scale\n",
|
| 554 |
+
"\n",
|
| 555 |
+
" def isNumeric(self):\n",
|
| 556 |
+
" return True\n",
|
| 557 |
+
"\n",
|
| 558 |
+
" def sample(self):\n",
|
| 559 |
+
" \"\"\"\n",
|
| 560 |
+
" samples value\n",
|
| 561 |
+
" \"\"\"\n",
|
| 562 |
+
" return np.random.gamma(self.shape, self.scale)\n",
|
| 563 |
+
"\n",
|
| 564 |
+
"class GaussianRejectSampler:\n",
|
| 565 |
+
" \"\"\"\n",
|
| 566 |
+
" gaussian sampling based on rejection sampling\n",
|
| 567 |
+
" \"\"\"\n",
|
| 568 |
+
" def __init__(self, mean, stdDev):\n",
|
| 569 |
+
" \"\"\"\n",
|
| 570 |
+
" initializer\n",
|
| 571 |
+
"\n",
|
| 572 |
+
" Parameters\n",
|
| 573 |
+
" mean : mean\n",
|
| 574 |
+
" stdDev : std deviation\n",
|
| 575 |
+
" \"\"\"\n",
|
| 576 |
+
" self.mean = mean\n",
|
| 577 |
+
" self.stdDev = stdDev\n",
|
| 578 |
+
" self.xmin = mean - 3 * stdDev\n",
|
| 579 |
+
" self.xmax = mean + 3 * stdDev\n",
|
| 580 |
+
" self.ymin = 0.0\n",
|
| 581 |
+
"        self.fmax = 1.0 / (math.sqrt(2.0 * math.pi) * stdDev)\n",
|
| 582 |
+
" self.ymax = 1.05 * self.fmax\n",
|
| 583 |
+
" self.sampleAsInt = False\n",
|
| 584 |
+
"\n",
|
| 585 |
+
" def isNumeric(self):\n",
|
| 586 |
+
" return True\n",
|
| 587 |
+
"\n",
|
| 588 |
+
" def sampleAsIntValue(self):\n",
|
| 589 |
+
" \"\"\"\n",
|
| 590 |
+
" sample as int value\n",
|
| 591 |
+
" \"\"\"\n",
|
| 592 |
+
" self.sampleAsInt = True\n",
|
| 593 |
+
"\n",
|
| 594 |
+
" def sample(self):\n",
|
| 595 |
+
" \"\"\"\n",
|
| 596 |
+
" samples value\n",
|
| 597 |
+
" \"\"\"\n",
|
| 598 |
+
" done = False\n",
|
| 599 |
+
" samp = 0\n",
|
| 600 |
+
" while not done:\n",
|
| 601 |
+
" x = randomFloat(self.xmin, self.xmax)\n",
|
| 602 |
+
" y = randomFloat(self.ymin, self.ymax)\n",
|
| 603 |
+
" f = self.fmax * math.exp(-(x - self.mean) * (x - self.mean) / (2.0 * self.stdDev * self.stdDev))\n",
|
| 604 |
+
" if (y < f):\n",
|
| 605 |
+
" done = True\n",
|
| 606 |
+
" samp = x\n",
|
| 607 |
+
" if self.sampleAsInt:\n",
|
| 608 |
+
" samp = int(samp)\n",
|
| 609 |
+
" return samp\n",
|
| 610 |
+
"\n",
|
| 611 |
+
"class DiscreteRejectSampler:\n",
|
| 612 |
+
" \"\"\"\n",
|
| 613 |
+
" non parametric sampling for discrete values using given distribution based \n",
|
| 614 |
+
" on rejection sampling\t\n",
|
| 615 |
+
" \"\"\"\n",
|
| 616 |
+
" def __init__(self, xmin, xmax, step, *values):\n",
|
| 617 |
+
" \"\"\"\n",
|
| 618 |
+
" initializer\n",
|
| 619 |
+
"\n",
|
| 620 |
+
" Parameters\n",
|
| 621 |
+
" xmin : min value\n",
|
| 622 |
+
" xmax : max value\n",
|
| 623 |
+
" step : discrete step\n",
|
| 624 |
+
" values : distr values\n",
|
| 625 |
+
" \"\"\"\n",
|
| 626 |
+
" self.xmin = xmin\n",
|
| 627 |
+
" self.xmax = xmax\n",
|
| 628 |
+
" self.step = step\n",
|
| 629 |
+
" self.distr = values\n",
|
| 630 |
+
" if (len(self.distr) == 1):\n",
|
| 631 |
+
" self.distr = self.distr[0]\t\n",
|
| 632 |
+
" numSteps = int((self.xmax - self.xmin) / self.step)\n",
|
| 633 |
+
" #print(\"{:.3f} {:.3f} {:.3f} {}\".format(self.xmin, self.xmax, self.step, numSteps))\n",
|
| 634 |
+
" assert len(self.distr)\t== numSteps + 1, \"invalid number of distr values expected {}\".format(numSteps + 1)\n",
|
| 635 |
+
" self.ximin = 0\n",
|
| 636 |
+
" self.ximax = numSteps\n",
|
| 637 |
+
" self.pmax = float(max(self.distr))\n",
|
| 638 |
+
"\n",
|
| 639 |
+
" def isNumeric(self):\n",
|
| 640 |
+
" return True\n",
|
| 641 |
+
"\n",
|
| 642 |
+
" def sample(self):\n",
|
| 643 |
+
" \"\"\"\n",
|
| 644 |
+
" samples value\n",
|
| 645 |
+
" \"\"\"\n",
|
| 646 |
+
" done = False\n",
|
| 647 |
+
" samp = None\n",
|
| 648 |
+
" while not done:\n",
|
| 649 |
+
" xi = randint(self.ximin, self.ximax)\n",
|
| 650 |
+
" #print(formatAny(xi, \"xi\"))\n",
|
| 651 |
+
" ps = randomFloat(0.0, self.pmax)\n",
|
| 652 |
+
" pa = self.distr[xi]\n",
|
| 653 |
+
" if ps < pa:\n",
|
| 654 |
+
" samp = self.xmin + xi * self.step\n",
|
| 655 |
+
" done = True\n",
|
| 656 |
+
" return samp\n",
|
| 657 |
+
"\n",
|
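"#illustrative usage sketch (added comment, not in the original code):\n",
"#discrete values 1..5 with relative weights 10,20,30,20,10\n",
"#dsampler = DiscreteRejectSampler(1, 5, 1, 10, 20, 30, 20, 10)\n",
"#value = dsampler.sample()\n",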
| 658 |
+
"\n",
|
| 659 |
+
"class TriangularRejectSampler:\n",
|
| 660 |
+
" \"\"\"\n",
|
| 661 |
+
" non parametric sampling using triangular distribution based on rejection sampling\t\n",
|
| 662 |
+
" \"\"\"\n",
|
| 663 |
+
" def __init__(self, xmin, xmax, vertexValue, vertexPos=None):\n",
|
| 664 |
+
" \"\"\"\n",
|
| 665 |
+
" initializer\n",
|
| 666 |
+
"\n",
|
| 667 |
+
" Parameters\n",
|
| 668 |
+
" xmin : min value\n",
|
| 669 |
+
" xmax : max value\n",
|
| 670 |
+
" vertexValue : distr value at vertex\n",
|
| 671 |
+
"        vertexPos : vertex position\n",
|
| 672 |
+
" \"\"\"\n",
|
| 673 |
+
" self.xmin = xmin\n",
|
| 674 |
+
" self.xmax = xmax\n",
|
| 675 |
+
" self.vertexValue = vertexValue\n",
|
| 676 |
+
" if vertexPos: \n",
|
| 677 |
+
" assert vertexPos > xmin and vertexPos < xmax, \"vertex position outside bound\"\n",
|
| 678 |
+
" self.vertexPos = vertexPos\n",
|
| 679 |
+
" else:\n",
|
| 680 |
+
" self.vertexPos = 0.5 * (xmin + xmax)\n",
|
| 681 |
+
" self.s1 = vertexValue / (self.vertexPos - xmin)\n",
|
| 682 |
+
" self.s2 = vertexValue / (xmax - self.vertexPos)\n",
|
| 683 |
+
"\n",
|
| 684 |
+
" def isNumeric(self):\n",
|
| 685 |
+
" return True\n",
|
| 686 |
+
"\n",
|
| 687 |
+
" def sample(self):\n",
|
| 688 |
+
" \"\"\"\n",
|
| 689 |
+
" samples value\n",
|
| 690 |
+
" \"\"\"\n",
|
| 691 |
+
" done = False\n",
|
| 692 |
+
" samp = None\n",
|
| 693 |
+
" while not done:\n",
|
| 694 |
+
" x = randomFloat(self.xmin, self.xmax)\n",
|
| 695 |
+
" y = randomFloat(0.0, self.vertexValue)\n",
|
| 696 |
+
" f = (x - self.xmin) * self.s1 if x < self.vertexPos else (self.xmax - x) * self.s2\n",
|
| 697 |
+
" if (y < f):\n",
|
| 698 |
+
" done = True\n",
|
| 699 |
+
" samp = x\n",
|
| 700 |
+
"\n",
|
| 701 |
+
" return samp;\t\n",
|
| 702 |
+
"\n",
|
| 703 |
+
"class NonParamRejectSampler:\n",
|
| 704 |
+
" \"\"\"\n",
|
| 705 |
+
" non parametric sampling using given distribution based on rejection sampling\t\n",
|
| 706 |
+
" \"\"\"\n",
|
| 707 |
+
" def __init__(self, xmin, binWidth, *values):\n",
|
| 708 |
+
" \"\"\"\n",
|
| 709 |
+
" initializer\n",
|
| 710 |
+
"\n",
|
| 711 |
+
" Parameters\n",
|
| 712 |
+
" xmin : min value\n",
|
| 713 |
+
" binWidth : bin width\n",
|
| 714 |
+
" values : distr values\n",
|
| 715 |
+
" \"\"\"\n",
|
| 716 |
+
" self.values = values\n",
|
| 717 |
+
" if (len(self.values) == 1):\n",
|
| 718 |
+
" self.values = self.values[0]\n",
|
| 719 |
+
" self.xmin = xmin\n",
|
| 720 |
+
" self.xmax = xmin + binWidth * (len(self.values) - 1)\n",
|
| 721 |
+
" #print(self.xmin, self.xmax, binWidth)\n",
|
| 722 |
+
" self.binWidth = binWidth\n",
|
| 723 |
+
" self.fmax = 0\n",
|
| 724 |
+
" for v in self.values:\n",
|
| 725 |
+
" if (v > self.fmax):\n",
|
| 726 |
+
" self.fmax = v\n",
|
| 727 |
+
" self.ymin = 0\n",
|
| 728 |
+
" self.ymax = self.fmax\n",
|
| 729 |
+
" self.sampleAsInt = True\n",
|
| 730 |
+
"\n",
|
| 731 |
+
" def isNumeric(self):\n",
|
| 732 |
+
" return True\n",
|
| 733 |
+
"\n",
|
| 734 |
+
" def sampleAsFloat(self):\n",
|
| 735 |
+
" self.sampleAsInt = False\n",
|
| 736 |
+
"\n",
|
| 737 |
+
" def sample(self):\n",
|
| 738 |
+
" \"\"\"\n",
|
| 739 |
+
" samples value\n",
|
| 740 |
+
" \"\"\"\n",
|
| 741 |
+
" done = False\n",
|
| 742 |
+
" samp = 0\n",
|
| 743 |
+
" while not done:\n",
|
| 744 |
+
" if self.sampleAsInt:\n",
|
| 745 |
+
" x = random.randint(self.xmin, self.xmax)\n",
|
| 746 |
+
" y = random.randint(self.ymin, self.ymax)\n",
|
| 747 |
+
" else:\n",
|
| 748 |
+
" x = randomFloat(self.xmin, self.xmax)\n",
|
| 749 |
+
" y = randomFloat(self.ymin, self.ymax)\n",
|
| 750 |
+
" bin = int((x - self.xmin) / self.binWidth)\n",
|
| 751 |
+
" f = self.values[bin]\n",
|
| 752 |
+
" if (y < f):\n",
|
| 753 |
+
" done = True\n",
|
| 754 |
+
" samp = x\n",
|
| 755 |
+
" return samp\n",
|
| 756 |
+
"\n",
|
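"#illustrative usage sketch (added comment, not in the original code):\n",
"#bins start at 20 with width 10 and counts 5,15,30,15,5\n",
"#npsampler = NonParamRejectSampler(20, 10, 5, 15, 30, 15, 5)\n",
"#value = npsampler.sample()   #call sampleAsFloat() first for float samples\n",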
| 757 |
+
"class JointNonParamRejectSampler:\n",
|
| 758 |
+
" \"\"\"\n",
|
| 759 |
+
" non parametric sampling using given distribution based on rejection sampling\t\n",
|
| 760 |
+
" \"\"\"\n",
|
| 761 |
+
" def __init__(self, xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values):\n",
|
| 762 |
+
" \"\"\"\n",
|
| 763 |
+
" initializer\n",
|
| 764 |
+
"\n",
|
| 765 |
+
" Parameters\n",
|
| 766 |
+
" xmin : min value for x\n",
|
| 767 |
+
" xbinWidth : bin width for x\n",
|
| 768 |
+
" xnbin : no of bins for x\n",
|
| 769 |
+
" ymin : min value for y\n",
|
| 770 |
+
" ybinWidth : bin width for y\n",
|
| 771 |
+
" ynbin : no of bins for y\n",
|
| 772 |
+
" values : distr values\n",
|
| 773 |
+
" \"\"\"\n",
|
| 774 |
+
" self.values = values\n",
|
| 775 |
+
" if (len(self.values) == 1):\n",
|
| 776 |
+
" self.values = self.values[0]\n",
|
| 777 |
+
" assert len(self.values) == xnbin * ynbin, \"wrong number of values for joint distr\"\n",
|
| 778 |
+
" self.xmin = xmin\n",
|
| 779 |
+
" self.xmax = xmin + xbinWidth * xnbin\n",
|
| 780 |
+
" self.xbinWidth = xbinWidth\n",
|
| 781 |
+
" self.ymin = ymin\n",
|
| 782 |
+
" self.ymax = ymin + ybinWidth * ynbin\n",
|
| 783 |
+
" self.ybinWidth = ybinWidth\n",
|
| 784 |
+
" self.pmax = max(self.values)\n",
|
| 785 |
+
" self.values = np.array(self.values).reshape(xnbin, ynbin)\n",
|
| 786 |
+
"\n",
|
| 787 |
+
" def isNumeric(self):\n",
|
| 788 |
+
" return True\n",
|
| 789 |
+
"\n",
|
| 790 |
+
" def sample(self):\n",
|
| 791 |
+
" \"\"\"\n",
|
| 792 |
+
" samples value\n",
|
| 793 |
+
" \"\"\"\n",
|
| 794 |
+
" done = False\n",
|
| 795 |
+
" samp = 0\n",
|
| 796 |
+
" while not done:\n",
|
| 797 |
+
" x = randomFloat(self.xmin, self.xmax)\n",
|
| 798 |
+
" y = randomFloat(self.ymin, self.ymax)\n",
|
| 799 |
+
" xbin = int((x - self.xmin) / self.xbinWidth)\n",
|
| 800 |
+
" ybin = int((y - self.ymin) / self.ybinWidth)\n",
|
| 801 |
+
" ap = self.values[xbin][ybin]\n",
|
| 802 |
+
" sp = randomFloat(0.0, self.pmax)\n",
|
| 803 |
+
" if (sp < ap):\n",
|
| 804 |
+
" done = True\n",
|
| 805 |
+
" samp = [x,y]\n",
|
| 806 |
+
" return samp\n",
|
| 807 |
+
"\n",
|
| 808 |
+
"\n",
|
| 809 |
+
"class JointNormalSampler:\n",
|
| 810 |
+
" \"\"\"\n",
|
| 811 |
+
" joint normal sampler\t\n",
|
| 812 |
+
" \"\"\"\n",
|
| 813 |
+
" def __init__(self, *values):\n",
|
| 814 |
+
" \"\"\"\n",
|
| 815 |
+
" initializer\n",
|
| 816 |
+
"\n",
|
| 817 |
+
" Parameters\n",
|
| 818 |
+
" values : 2 mean values followed by 4 values for covar matrix\n",
|
| 819 |
+
" \"\"\"\n",
|
| 820 |
+
" lvalues = list(values)\n",
|
| 821 |
+
" assert len(lvalues) == 6, \"incorrect number of arguments for joint normal sampler\"\n",
|
| 822 |
+
" mean = lvalues[:2]\n",
|
| 823 |
+
" self.mean = np.array(mean)\n",
|
| 824 |
+
" sd = lvalues[2:]\n",
|
| 825 |
+
" self.sd = np.array(sd).reshape(2,2)\n",
|
| 826 |
+
"\n",
|
| 827 |
+
" def isNumeric(self):\n",
|
| 828 |
+
" return True\n",
|
| 829 |
+
"\n",
|
| 830 |
+
" def sample(self):\n",
|
| 831 |
+
" \"\"\"\n",
|
| 832 |
+
" samples value\n",
|
| 833 |
+
" \"\"\"\n",
|
| 834 |
+
" return list(np.random.multivariate_normal(self.mean, self.sd))\n",
|
| 835 |
+
"\n",
|
| 836 |
+
"\n",
|
| 837 |
+
"class MultiVarNormalSampler:\n",
|
| 838 |
+
" \"\"\"\n",
|
| 839 |
+
"    multivariate normal sampler\n",
|
| 840 |
+
" \"\"\"\n",
|
| 841 |
+
" def __init__(self, numVar, *values):\n",
|
| 842 |
+
" \"\"\"\n",
|
| 843 |
+
" initializer\n",
|
| 844 |
+
"\n",
|
| 845 |
+
" Parameters\n",
|
| 846 |
+
" numVar : no of variables\n",
|
| 847 |
+
" values : numVar mean values followed by numVar x numVar values for covar matrix\n",
|
| 848 |
+
" \"\"\"\n",
|
| 849 |
+
" lvalues = list(values)\n",
|
| 850 |
+
" assert len(lvalues) == numVar + numVar * numVar, \"incorrect number of arguments for multi var normal sampler\"\n",
|
| 851 |
+
" mean = lvalues[:numVar]\n",
|
| 852 |
+
" self.mean = np.array(mean)\n",
|
| 853 |
+
" sd = lvalues[numVar:]\n",
|
| 854 |
+
" self.sd = np.array(sd).reshape(numVar,numVar)\n",
|
| 855 |
+
"\n",
|
| 856 |
+
" def isNumeric(self):\n",
|
| 857 |
+
" return True\n",
|
| 858 |
+
"\n",
|
| 859 |
+
" def sample(self):\n",
|
| 860 |
+
" \"\"\"\n",
|
| 861 |
+
" samples value\n",
|
| 862 |
+
" \"\"\"\n",
|
| 863 |
+
" return list(np.random.multivariate_normal(self.mean, self.sd))\n",
|
| 864 |
+
"\n",
|
| 865 |
+
"class CategoricalRejectSampler:\n",
|
| 866 |
+
" \"\"\"\n",
|
| 867 |
+
" non parametric sampling for categorical attributes using given distribution based \n",
|
| 868 |
+
" on rejection sampling\t\n",
|
| 869 |
+
" \"\"\"\n",
|
| 870 |
+
" def __init__(self, *values):\n",
|
| 871 |
+
" \"\"\"\n",
|
| 872 |
+
" initializer\n",
|
| 873 |
+
"\n",
|
| 874 |
+
" Parameters\n",
|
| 875 |
+
"        values : list of tuples, each containing a categorical value and the corresponding distr value\n",
|
| 876 |
+
" \"\"\"\n",
|
| 877 |
+
" self.distr = values\n",
|
| 878 |
+
" if (len(self.distr) == 1):\n",
|
| 879 |
+
" self.distr = self.distr[0]\n",
|
| 880 |
+
" maxv = 0\n",
|
| 881 |
+
" for t in self.distr:\n",
|
| 882 |
+
" if t[1] > maxv:\n",
|
| 883 |
+
" maxv = t[1]\n",
|
| 884 |
+
" self.maxv = maxv\n",
|
| 885 |
+
"\n",
|
| 886 |
+
" def sample(self):\n",
|
| 887 |
+
" \"\"\"\n",
|
| 888 |
+
" samples value\n",
|
| 889 |
+
" \"\"\"\n",
|
| 890 |
+
" done = False\n",
|
| 891 |
+
" samp = \"\"\n",
|
| 892 |
+
" while not done:\n",
|
| 893 |
+
" t = self.distr[randint(0, len(self.distr)-1)]\t\n",
|
| 894 |
+
" d = randomFloat(0, self.maxv)\t\n",
|
| 895 |
+
" if (d <= t[1]):\n",
|
| 896 |
+
" done = True\n",
|
| 897 |
+
" samp = t[0]\n",
|
| 898 |
+
" return samp\n",
|
| 899 |
+
"\n",
|
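"#illustrative usage sketch (added comment, not in the original code):\n",
"#csampler = CategoricalRejectSampler((\"low\", 60), (\"med\", 30), (\"high\", 10))\n",
"#label = csampler.sample()\n",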
| 900 |
+
"\n",
|
| 901 |
+
"class DistrMixtureSampler:\n",
|
| 902 |
+
" \"\"\"\n",
|
| 903 |
+
" distr mixture sampler\n",
|
| 904 |
+
" \"\"\"\n",
|
| 905 |
+
" def __init__(self, mixtureWtDistr, *compDistr):\n",
|
| 906 |
+
" \"\"\"\n",
|
| 907 |
+
" initializer\n",
|
| 908 |
+
"\n",
|
| 909 |
+
" Parameters\n",
|
| 910 |
+
" mixtureWtDistr : sampler that returns index into sampler list\n",
|
| 911 |
+
" compDistr : sampler list\n",
|
| 912 |
+
" \"\"\"\n",
|
| 913 |
+
" self.mixtureWtDistr = mixtureWtDistr\n",
|
| 914 |
+
" self.compDistr = compDistr\n",
|
| 915 |
+
" if (len(self.compDistr) == 1):\n",
|
| 916 |
+
" self.compDistr = self.compDistr[0]\n",
|
| 917 |
+
"\n",
|
| 918 |
+
" def isNumeric(self):\n",
|
| 919 |
+
" return True\n",
|
| 920 |
+
"\n",
|
| 921 |
+
" def sample(self):\n",
|
| 922 |
+
" \"\"\"\n",
|
| 923 |
+
" samples value\n",
|
| 924 |
+
" \"\"\"\n",
|
| 925 |
+
" comp = self.mixtureWtDistr.sample()\n",
|
| 926 |
+
"\n",
|
| 927 |
+
" #sample sampled comp distr\n",
|
| 928 |
+
" return self.compDistr[comp].sample()\n",
|
| 929 |
+
"\n",
|
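"#illustrative usage sketch (added comment, not in the original code): a two component\n",
"#mixture where the first argument samples the component index (0 or 1)\n",
"#msampler = DistrMixtureSampler(DiscreteRejectSampler(0, 1, 1, 70, 30), NormalSampler(10, 2), NormalSampler(50, 5))\n",
"#value = msampler.sample()\n",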
| 930 |
+
"class AncestralSampler:\n",
|
| 931 |
+
" \"\"\"\n",
|
| 932 |
+
" ancestral sampler using conditional distribution\n",
|
| 933 |
+
" \"\"\"\n",
|
| 934 |
+
" def __init__(self, parentDistr, childDistr, numChildren):\n",
|
| 935 |
+
" \"\"\"\n",
|
| 936 |
+
" initializer\n",
|
| 937 |
+
"\n",
|
| 938 |
+
" Parameters\n",
|
| 939 |
+
" parentDistr : parent distr\n",
|
| 940 |
+
"        childDistr : children distribution dictionary\n",
|
| 941 |
+
" numChildren : no of children\n",
|
| 942 |
+
" \"\"\"\n",
|
| 943 |
+
" self.parentDistr = parentDistr\n",
|
| 944 |
+
" self.childDistr = childDistr\n",
|
| 945 |
+
" self.numChildren = numChildren\n",
|
| 946 |
+
"\n",
|
| 947 |
+
" def sample(self):\n",
|
| 948 |
+
" \"\"\"\n",
|
| 949 |
+
" samples value\n",
|
| 950 |
+
" \"\"\"\n",
|
| 951 |
+
" parent = self.parentDistr.sample()\n",
|
| 952 |
+
"\n",
|
| 953 |
+
" #sample all children conditioned on parent\n",
|
| 954 |
+
" children = []\n",
|
| 955 |
+
" for i in range(self.numChildren):\n",
|
| 956 |
+
" key = (parent, i)\n",
|
| 957 |
+
" child = self.childDistr[key].sample()\n",
|
| 958 |
+
" children.append(child)\n",
|
| 959 |
+
" return (parent, children)\n",
|
| 960 |
+
"\n",
|
| 961 |
+
"class ClusterSampler:\n",
|
| 962 |
+
" \"\"\"\n",
|
| 963 |
+
" sample cluster and then sample member of sampled cluster\n",
|
| 964 |
+
" \"\"\"\n",
|
| 965 |
+
" def __init__(self, clusters, *clustDistr):\n",
|
| 966 |
+
" \"\"\"\n",
|
| 967 |
+
" initializer\n",
|
| 968 |
+
"\n",
|
| 969 |
+
" Parameters\n",
|
| 970 |
+
" clusters : dictionary clusters\n",
|
| 971 |
+
" clustDistr : distr for clusters\n",
|
| 972 |
+
" \"\"\"\n",
|
| 973 |
+
" self.sampler = CategoricalRejectSampler(*clustDistr)\n",
|
| 974 |
+
" self.clusters = clusters\n",
|
| 975 |
+
"\n",
|
| 976 |
+
" def sample(self):\n",
|
| 977 |
+
" \"\"\"\n",
|
| 978 |
+
" samples value\n",
|
| 979 |
+
" \"\"\"\n",
|
| 980 |
+
" cluster = self.sampler.sample()\n",
|
| 981 |
+
" member = random.choice(self.clusters[cluster])\n",
|
| 982 |
+
" return (cluster, member)\n",
|
| 983 |
+
"\n",
|
| 984 |
+
"\n",
|
| 985 |
+
"class MetropolitanSampler:\n",
|
| 986 |
+
" \"\"\"\n",
|
| 987 |
+
"    Metropolis (MCMC) sampler\n",
|
| 988 |
+
" \"\"\"\n",
|
| 989 |
+
" def __init__(self, propStdDev, min, binWidth, values):\n",
|
| 990 |
+
" \"\"\"\n",
|
| 991 |
+
" initializer\n",
|
| 992 |
+
"\n",
|
| 993 |
+
" Parameters\n",
|
| 994 |
+
" propStdDev : proposal distr std dev\n",
|
| 995 |
+
" min : min domain value for target distr\n",
|
| 996 |
+
" binWidth : bin width\n",
|
| 997 |
+
" values : target distr values\n",
|
| 998 |
+
" \"\"\"\n",
|
| 999 |
+
" self.targetDistr = Histogram.createInitialized(min, binWidth, values)\n",
|
| 1000 |
+
" self.propsalDistr = GaussianRejectSampler(0, propStdDev)\n",
|
| 1001 |
+
" self.proposalMixture = False\n",
|
| 1002 |
+
"\n",
|
| 1003 |
+
" # bootstrap sample\n",
|
| 1004 |
+
" (minv, maxv) = self.targetDistr.getMinMax()\n",
|
| 1005 |
+
" self.curSample = random.randint(minv, maxv)\n",
|
| 1006 |
+
" self.curDistr = self.targetDistr.value(self.curSample)\n",
|
| 1007 |
+
" self.transCount = 0\n",
|
| 1008 |
+
"\n",
|
| 1009 |
+
" def initialize(self):\n",
|
| 1010 |
+
" \"\"\"\n",
|
| 1011 |
+
" initialize\n",
|
| 1012 |
+
" \"\"\"\n",
|
| 1013 |
+
" (minv, maxv) = self.targetDistr.getMinMax()\n",
|
| 1014 |
+
" self.curSample = random.randint(minv, maxv)\n",
|
| 1015 |
+
" self.curDistr = self.targetDistr.value(self.curSample)\n",
|
| 1016 |
+
" self.transCount = 0\n",
|
| 1017 |
+
"\n",
|
| 1018 |
+
" def setProposalDistr(self, propsalDistr):\n",
|
| 1019 |
+
" \"\"\"\n",
|
| 1020 |
+
" set custom proposal distribution\n",
|
| 1021 |
+
" Parameters\n",
|
| 1022 |
+
" propsalDistr : proposal distribution\n",
|
| 1023 |
+
" \"\"\"\n",
|
| 1024 |
+
" self.propsalDistr = propsalDistr\n",
|
| 1025 |
+
"\n",
|
| 1026 |
+
"\n",
|
| 1027 |
+
" def setGlobalProposalDistr(self, globPropStdDev, proposalChoiceThreshold):\n",
|
| 1028 |
+
" \"\"\"\n",
|
| 1029 |
+
" set custom proposal distribution\n",
|
| 1030 |
+
" Parameters\n",
|
| 1031 |
+
" globPropStdDev : global proposal distr std deviation\n",
|
| 1032 |
+
" proposalChoiceThreshold : threshold for using global proposal distribution\n",
|
| 1033 |
+
" \"\"\"\n",
|
| 1034 |
+
" self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)\n",
|
| 1035 |
+
" self.proposalChoiceThreshold = proposalChoiceThreshold\n",
|
| 1036 |
+
" self.proposalMixture = True\n",
|
| 1037 |
+
"\n",
|
| 1038 |
+
" def sample(self):\n",
|
| 1039 |
+
" \"\"\"\n",
|
| 1040 |
+
" samples value\n",
|
| 1041 |
+
" \"\"\"\n",
|
| 1042 |
+
" nextSample = self.proposalSample(1)\n",
|
| 1043 |
+
" self.targetSample(nextSample)\n",
|
| 1044 |
+
" return self.curSample;\n",
|
| 1045 |
+
"\n",
|
| 1046 |
+
" def proposalSample(self, skip):\n",
|
| 1047 |
+
" \"\"\"\n",
|
| 1048 |
+
" sample from proposal distribution\n",
|
| 1049 |
+
" Parameters\n",
|
| 1050 |
+
" skip : no of samples to skip\n",
|
| 1051 |
+
" \"\"\"\n",
|
| 1052 |
+
" for i in range(skip):\n",
|
| 1053 |
+
" if not self.proposalMixture:\n",
|
| 1054 |
+
" #one proposal distr\n",
|
| 1055 |
+
" nextSample = self.curSample + self.propsalDistr.sample()\n",
|
| 1056 |
+
" nextSample = self.targetDistr.boundedValue(nextSample)\n",
|
| 1057 |
+
" else:\n",
|
| 1058 |
+
" #mixture of proposal distr\n",
|
| 1059 |
+
" if random.random() < self.proposalChoiceThreshold:\n",
|
| 1060 |
+
" nextSample = self.curSample + self.propsalDistr.sample()\n",
|
| 1061 |
+
" else:\n",
|
| 1062 |
+
" nextSample = self.curSample + self.globalProposalDistr.sample()\n",
|
| 1063 |
+
" nextSample = self.targetDistr.boundedValue(nextSample)\n",
|
| 1064 |
+
"\n",
|
| 1065 |
+
" return nextSample\n",
|
| 1066 |
+
"\n",
|
| 1067 |
+
" def targetSample(self, nextSample):\n",
|
| 1068 |
+
" \"\"\"\n",
|
| 1069 |
+
" target sample\n",
|
| 1070 |
+
" Parameters\n",
|
| 1071 |
+
" nextSample : proposal distr sample\n",
|
| 1072 |
+
" \"\"\"\n",
|
| 1073 |
+
" nextDistr = self.targetDistr.value(nextSample)\n",
|
| 1074 |
+
"\n",
|
| 1075 |
+
" transition = False\n",
|
| 1076 |
+
" if nextDistr > self.curDistr:\n",
|
| 1077 |
+
" transition = True\n",
|
| 1078 |
+
" else:\n",
|
| 1079 |
+
" distrRatio = float(nextDistr) / self.curDistr\n",
|
| 1080 |
+
" if random.random() < distrRatio:\n",
|
| 1081 |
+
" transition = True\n",
|
| 1082 |
+
"\n",
|
| 1083 |
+
" if transition:\n",
|
| 1084 |
+
" self.curSample = nextSample\n",
|
| 1085 |
+
" self.curDistr = nextDistr\n",
|
| 1086 |
+
" self.transCount += 1\n",
|
| 1087 |
+
"\n",
|
| 1088 |
+
"\n",
|
| 1089 |
+
" def subSample(self, skip):\n",
|
| 1090 |
+
" \"\"\"\n",
|
| 1091 |
+
" sub sample\n",
|
| 1092 |
+
" Parameters\n",
|
| 1093 |
+
" skip : no of samples to skip\n",
|
| 1094 |
+
" \"\"\"\n",
|
| 1095 |
+
" nextSample = self.proposalSample(skip)\n",
|
| 1096 |
+
" self.targetSample(nextSample)\n",
|
| 1097 |
+
" return self.curSample;\n",
|
| 1098 |
+
"\n",
|
| 1099 |
+
" def setMixtureProposal(self, globPropStdDev, mixtureThreshold):\n",
|
| 1100 |
+
" \"\"\"\n",
|
| 1101 |
+
" mixture proposal\n",
|
| 1102 |
+
" Parameters\n",
|
| 1103 |
+
" globPropStdDev : global proposal distr std deviation\n",
|
| 1104 |
+
" mixtureThreshold : threshold for using global proposal distribution\n",
|
| 1105 |
+
" \"\"\"\n",
|
| 1106 |
+
" self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)\n",
|
| 1107 |
+
" self.mixtureThreshold = mixtureThreshold\n",
|
| 1108 |
+
"\n",
|
| 1109 |
+
" def samplePropsal(self):\n",
|
| 1110 |
+
" \"\"\"\n",
|
| 1111 |
+
" sample from proposal distr\n",
|
| 1112 |
+
" \"\"\"\n",
|
| 1113 |
+
"        if getattr(self, \"globalProposalDistr\", None) is None:\n",
|
| 1114 |
+
" proposal = self.propsalDistr.sample()\n",
|
| 1115 |
+
" else:\n",
|
| 1116 |
+
" if random.random() < self.mixtureThreshold:\n",
|
| 1117 |
+
" proposal = self.propsalDistr.sample()\n",
|
| 1118 |
+
" else:\n",
|
| 1119 |
+
" proposal = self.globalProposalDistr.sample()\n",
|
| 1120 |
+
"\n",
|
| 1121 |
+
" return proposal\n",
|
| 1122 |
+
"\n",
|
| 1123 |
+
"class PermutationSampler:\n",
|
| 1124 |
+
" \"\"\"\n",
|
| 1125 |
+
" permutation sampler by shuffling a list\n",
|
| 1126 |
+
" \"\"\"\n",
|
| 1127 |
+
" def __init__(self):\n",
|
| 1128 |
+
" \"\"\"\n",
|
| 1129 |
+
" initialize\n",
|
| 1130 |
+
" \"\"\"\n",
|
| 1131 |
+
" self.values = None\n",
|
| 1132 |
+
" self.numShuffles = None\n",
|
| 1133 |
+
"\n",
|
| 1134 |
+
" @staticmethod\n",
|
| 1135 |
+
" def createSamplerWithValues(values, *numShuffles):\n",
|
| 1136 |
+
" \"\"\"\n",
|
| 1137 |
+
" creator with values\n",
|
| 1138 |
+
" Parameters\n",
|
| 1139 |
+
" values : list data\n",
|
| 1140 |
+
" numShuffles : no of shuffles or range of no of shuffles\n",
|
| 1141 |
+
" \"\"\"\n",
|
| 1142 |
+
" sampler = PermutationSampler()\n",
|
| 1143 |
+
" sampler.values = values\n",
|
| 1144 |
+
" sampler.numShuffles = numShuffles\n",
|
| 1145 |
+
" return sampler\n",
|
| 1146 |
+
"\n",
|
| 1147 |
+
" @staticmethod\n",
|
| 1148 |
+
" def createSamplerWithRange(minv, maxv, *numShuffles):\n",
|
| 1149 |
+
" \"\"\"\n",
|
| 1150 |
+
"        creator with range min and max\n",
|
| 1151 |
+
"\n",
|
| 1152 |
+
" Parameters\n",
|
| 1153 |
+
" minv : min of range\n",
|
| 1154 |
+
" maxv : max of range\n",
|
| 1155 |
+
" numShuffles : no of shuffles or range of no of shuffles\n",
|
| 1156 |
+
" \"\"\"\n",
|
| 1157 |
+
" sampler = PermutationSampler()\n",
|
| 1158 |
+
" sampler.values = list(range(minv, maxv + 1))\n",
|
| 1159 |
+
" sampler.numShuffles = numShuffles\n",
|
| 1160 |
+
" return sampler\n",
|
| 1161 |
+
"\n",
|
| 1162 |
+
" def sample(self):\n",
|
| 1163 |
+
" \"\"\"\n",
|
| 1164 |
+
" sample new permutation\n",
|
| 1165 |
+
" \"\"\"\n",
|
| 1166 |
+
" cloned = self.values.copy()\n",
|
| 1167 |
+
" shuffle(cloned, *self.numShuffles)\n",
|
| 1168 |
+
" return cloned\n",
|
| 1169 |
+
"\n",
|
| 1170 |
+
"class SpikeyDataSampler:\n",
|
| 1171 |
+
" \"\"\"\n",
|
| 1172 |
+
" samples spikey data\n",
|
| 1173 |
+
" \"\"\"\n",
|
| 1174 |
+
" def __init__(self, intvMean, intvScale, distr, spikeValueMean, spikeValueStd, spikeMaxDuration, baseValue = 0):\n",
|
| 1175 |
+
" \"\"\"\n",
|
| 1176 |
+
" initializer\n",
|
| 1177 |
+
"\n",
|
| 1178 |
+
" Parameters\n",
|
| 1179 |
+
" intvMean : interval mean\n",
|
| 1180 |
+
" intvScale : interval std dev\n",
|
| 1181 |
+
" distr : type of distr for interval\n",
|
| 1182 |
+
" spikeValueMean : spike value mean\n",
|
| 1183 |
+
" spikeValueStd : spike value std dev\n",
|
| 1184 |
+
" spikeMaxDuration : max duration for spike\n",
|
| 1185 |
+
" baseValue : base or offset value\n",
|
| 1186 |
+
" \"\"\"\n",
|
| 1187 |
+
" if distr == \"norm\":\n",
|
| 1188 |
+
" self.intvSampler = NormalSampler(intvMean, intvScale)\n",
|
| 1189 |
+
" elif distr == \"expo\":\n",
|
| 1190 |
+
" rate = 1.0 / intvScale\n",
|
| 1191 |
+
" self.intvSampler = ExponentialSampler(rate)\n",
|
| 1192 |
+
" else:\n",
|
| 1193 |
+
" raise ValueError(\"invalid distribution\")\n",
|
| 1194 |
+
"\n",
|
| 1195 |
+
" self.spikeSampler = NormalSampler(spikeValueMean, spikeValueStd)\n",
|
| 1196 |
+
" self.spikeMaxDuration = spikeMaxDuration\n",
|
| 1197 |
+
" self.baseValue = baseValue\n",
|
| 1198 |
+
" self.inSpike = False\n",
|
| 1199 |
+
" self.spikeCount = 0\n",
|
| 1200 |
+
" self.baseCount = 0\n",
|
| 1201 |
+
" self.baseLength = int(self.intvSampler.sample())\n",
|
| 1202 |
+
" self.spikeValues = list()\n",
|
| 1203 |
+
" self.spikeLength = None\n",
|
| 1204 |
+
"\n",
|
| 1205 |
+
" def sample(self):\n",
|
| 1206 |
+
" \"\"\"\n",
|
| 1207 |
+
" sample new value\n",
|
| 1208 |
+
" \"\"\"\n",
|
| 1209 |
+
" if self.baseCount <= self.baseLength:\n",
|
| 1210 |
+
" sampled = self.baseValue\n",
|
| 1211 |
+
" self.baseCount += 1\n",
|
| 1212 |
+
" else:\n",
|
| 1213 |
+
" if not self.inSpike:\n",
|
| 1214 |
+
" #starting spike\n",
|
| 1215 |
+
" spikeVal = self.spikeSampler.sample()\n",
|
| 1216 |
+
" self.spikeLength = sampleUniform(1, self.spikeMaxDuration)\n",
|
| 1217 |
+
" spikeMaxPos = 0 if self.spikeLength == 1 else sampleUniform(0, self.spikeLength-1)\n",
|
| 1218 |
+
" self.spikeValues.clear()\n",
|
| 1219 |
+
" for i in range(self.spikeLength):\n",
|
| 1220 |
+
" if i < spikeMaxPos:\n",
|
| 1221 |
+
" frac = (i + 1) / (spikeMaxPos + 1)\n",
|
| 1222 |
+
" frac = sampleFloatFromBase(frac, 0.1 * frac)\n",
|
| 1223 |
+
" elif i > spikeMaxPos:\n",
|
| 1224 |
+
" frac = (self.spikeLength - i) / (self.spikeLength - spikeMaxPos)\n",
|
| 1225 |
+
" frac = sampleFloatFromBase(frac, 0.1 * frac)\n",
|
| 1226 |
+
" else:\n",
|
| 1227 |
+
" frac = 1.0\n",
|
| 1228 |
+
" self.spikeValues.append(frac * spikeVal)\n",
|
| 1229 |
+
" self.inSpike = True\n",
|
| 1230 |
+
" self.spikeCount = 0\n",
|
| 1231 |
+
"\n",
|
| 1232 |
+
"\n",
|
| 1233 |
+
" sampled = self.spikeValues[self.spikeCount]\n",
|
| 1234 |
+
" self.spikeCount += 1\n",
|
| 1235 |
+
"\n",
|
| 1236 |
+
" if self.spikeCount == self.spikeLength:\n",
|
| 1237 |
+
" #ending spike\n",
|
| 1238 |
+
" self.baseCount = 0\n",
|
| 1239 |
+
" self.baseLength = int(self.intvSampler.sample())\n",
|
| 1240 |
+
" self.inSpike = False\n",
|
| 1241 |
+
"\n",
|
| 1242 |
+
" return sampled\n",
|
| 1243 |
+
"\n",
|
| 1244 |
+
"\n",
|
| 1245 |
+
"class EventSampler:\n",
|
| 1246 |
+
" \"\"\"\n",
|
| 1247 |
+
" sample event\n",
|
| 1248 |
+
" \"\"\"\n",
|
| 1249 |
+
" def __init__(self, intvSampler, valSampler=None):\n",
|
| 1250 |
+
" \"\"\"\n",
|
| 1251 |
+
" initializer\n",
|
| 1252 |
+
"\n",
|
| 1253 |
+
" Parameters\n",
|
| 1254 |
+
" intvSampler : interval sampler\n",
|
| 1255 |
+
" valSampler : value sampler\n",
|
| 1256 |
+
" \"\"\"\n",
|
| 1257 |
+
" self.intvSampler = intvSampler\n",
|
| 1258 |
+
" self.valSampler = valSampler\n",
|
| 1259 |
+
" self.trigger = int(self.intvSampler.sample())\n",
|
| 1260 |
+
" self.count = 0\n",
|
| 1261 |
+
"\n",
|
| 1262 |
+
" def reset(self):\n",
|
| 1263 |
+
" \"\"\"\n",
|
| 1264 |
+
" reset trigger\n",
|
| 1265 |
+
" \"\"\"\n",
|
| 1266 |
+
" self.trigger = int(self.intvSampler.sample())\n",
|
| 1267 |
+
" self.count = 0\n",
|
| 1268 |
+
"\n",
|
| 1269 |
+
" def sample(self):\n",
|
| 1270 |
+
" \"\"\"\n",
|
| 1271 |
+
" sample event\n",
|
| 1272 |
+
" \"\"\"\n",
|
| 1273 |
+
" if self.count == self.trigger:\n",
|
| 1274 |
+
" sampled = self.valSampler.sample() if self.valSampler is not None else 1.0\n",
|
| 1275 |
+
" self.trigger = int(self.intvSampler.sample())\n",
|
| 1276 |
+
" self.count = 0\n",
|
| 1277 |
+
" else:\n",
|
| 1278 |
+
"            sampled = 0.0\n",
|
| 1279 |
+
" self.count += 1\n",
|
| 1280 |
+
" return sampled\n",
|
| 1281 |
+
"\n",
|
| 1282 |
+
"\n",
|
| 1283 |
+
"\n",
|
| 1284 |
+
"\n",
|
| 1285 |
+
"def createSampler(data):\n",
|
| 1286 |
+
" \"\"\"\n",
|
| 1287 |
+
" create sampler\n",
|
| 1288 |
+
"\n",
|
| 1289 |
+
" Parameters\n",
|
| 1290 |
+
" data : sampler description\n",
|
| 1291 |
+
" \"\"\"\n",
|
| 1292 |
+
" #print(data)\n",
|
| 1293 |
+
" items = data.split(\":\")\n",
|
| 1294 |
+
" size = len(items)\n",
|
| 1295 |
+
" dtype = items[-1]\n",
|
| 1296 |
+
" stype = items[-2]\n",
|
| 1297 |
+
" sampler = None\n",
|
| 1298 |
+
" if stype == \"uniform\":\n",
|
| 1299 |
+
" if dtype == \"int\":\n",
|
| 1300 |
+
" min = int(items[0])\n",
|
| 1301 |
+
" max = int(items[1])\n",
|
| 1302 |
+
" sampler = UniformNumericSampler(min, max)\n",
|
| 1303 |
+
" elif dtype == \"float\":\n",
|
| 1304 |
+
" min = float(items[0])\n",
|
| 1305 |
+
" max = float(items[1])\n",
|
| 1306 |
+
" sampler = UniformNumericSampler(min, max)\n",
|
| 1307 |
+
" elif dtype == \"categorical\":\n",
|
| 1308 |
+
" values = items[:-2]\n",
|
| 1309 |
+
" sampler = UniformCategoricalSampler(values)\n",
|
| 1310 |
+
" elif stype == \"normal\":\n",
|
| 1311 |
+
" mean = float(items[0])\n",
|
| 1312 |
+
" sd = float(items[1])\n",
|
| 1313 |
+
" sampler = NormalSampler(mean, sd)\n",
|
| 1314 |
+
" if dtype == \"int\":\n",
|
| 1315 |
+
" sampler.sampleAsIntValue()\n",
|
| 1316 |
+
" elif stype == \"nonparam\":\n",
|
| 1317 |
+
" if dtype == \"int\" or dtype == \"float\":\n",
|
| 1318 |
+
" min = int(items[0])\n",
|
| 1319 |
+
" binWidth = int(items[1])\n",
|
| 1320 |
+
" values = items[2:-2]\n",
|
| 1321 |
+
" values = list(map(lambda v: int(v), values))\n",
|
| 1322 |
+
" sampler = NonParamRejectSampler(min, binWidth, values)\n",
|
| 1323 |
+
" if dtype == \"float\":\n",
|
| 1324 |
+
" sampler.sampleAsFloat()\n",
|
| 1325 |
+
" elif dtype == \"categorical\":\n",
|
| 1326 |
+
" values = list()\n",
|
| 1327 |
+
" for i in range(0, size-2, 2):\n",
|
| 1328 |
+
" cval = items[i]\n",
|
| 1329 |
+
" dist = int(items[i+1])\n",
|
| 1330 |
+
" pair = (cval, dist)\n",
|
| 1331 |
+
" values.append(pair)\n",
|
| 1332 |
+
" sampler = CategoricalRejectSampler(values)\n",
|
| 1333 |
+
" elif stype == \"discrete\":\n",
|
| 1334 |
+
" vmin = int(items[0])\n",
|
| 1335 |
+
" vmax = int(items[1])\n",
|
| 1336 |
+
" step = int(items[2])\n",
|
| 1337 |
+
" values = list(map(lambda i : int(items[i]), range(3, len(items)-2)))\n",
|
| 1338 |
+
" sampler = DiscreteRejectSampler(vmin, vmax, step, values)\n",
|
| 1339 |
+
" else:\n",
|
| 1340 |
+
" raise ValueError(\"invalid sampler type \" + dtype)\n",
|
| 1341 |
+
" return sampler\n"
|
| 1342 |
+
]
|
| 1343 |
+
}
|
| 1344 |
+
],
|
| 1345 |
+
"metadata": {
|
| 1346 |
+
"kernelspec": {
|
| 1347 |
+
"display_name": "Python 3 (ipykernel)",
|
| 1348 |
+
"language": "python",
|
| 1349 |
+
"name": "python3"
|
| 1350 |
+
},
|
| 1351 |
+
"language_info": {
|
| 1352 |
+
"codemirror_mode": {
|
| 1353 |
+
"name": "ipython",
|
| 1354 |
+
"version": 3
|
| 1355 |
+
},
|
| 1356 |
+
"file_extension": ".py",
|
| 1357 |
+
"mimetype": "text/x-python",
|
| 1358 |
+
"name": "python",
|
| 1359 |
+
"nbconvert_exporter": "python",
|
| 1360 |
+
"pygments_lexer": "ipython3",
|
| 1361 |
+
"version": "3.9.12"
|
| 1362 |
+
}
|
| 1363 |
+
},
|
| 1364 |
+
"nbformat": 4,
|
| 1365 |
+
"nbformat_minor": 5
|
| 1366 |
+
}
|
lib/stats.ipynb
ADDED
|
@@ -0,0 +1,510 @@
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "f4cbab42",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import sys\n",
|
| 11 |
+
"import random \n",
|
| 12 |
+
"import time\n",
|
| 13 |
+
"import math\n",
|
| 14 |
+
"import numpy as np\n",
|
| 15 |
+
"import statistics \n",
|
| 16 |
+
"from util import *\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"\"\"\"\n",
|
| 19 |
+
"histogram class\n",
|
| 20 |
+
"\"\"\"\n",
|
| 21 |
+
"class Histogram:\n",
|
| 22 |
+
" def __init__(self, min, binWidth):\n",
|
| 23 |
+
" \"\"\"\n",
|
| 24 |
+
" initializer\n",
|
| 25 |
+
"\n",
|
| 26 |
+
" Parameters\n",
|
| 27 |
+
" min : min x\n",
|
| 28 |
+
" binWidth : bin width\n",
|
| 29 |
+
" \"\"\"\n",
|
| 30 |
+
" self.xmin = min\n",
|
| 31 |
+
" self.binWidth = binWidth\n",
|
| 32 |
+
" self.normalized = False\n",
|
| 33 |
+
"\n",
|
| 34 |
+
" @classmethod\n",
|
| 35 |
+
" def createInitialized(cls, xmin, binWidth, values):\n",
|
| 36 |
+
" \"\"\"\n",
|
| 37 |
+
" create histogram instance with min domain, bin width and values\n",
|
| 38 |
+
"\n",
|
| 39 |
+
" Parameters\n",
|
| 40 |
+
" min : min x\n",
|
| 41 |
+
" binWidth : bin width\n",
|
| 42 |
+
" values : y values\n",
|
| 43 |
+
" \"\"\"\n",
|
| 44 |
+
" instance = cls(xmin, binWidth)\n",
|
| 45 |
+
" instance.xmax = xmin + binWidth * (len(values) - 1)\n",
|
| 46 |
+
" instance.ymin = 0\n",
|
| 47 |
+
" instance.bins = np.array(values)\n",
|
| 48 |
+
" instance.fmax = 0\n",
|
| 49 |
+
" for v in values:\n",
|
| 50 |
+
" if (v > instance.fmax):\n",
|
| 51 |
+
" instance.fmax = v\n",
|
| 52 |
+
" instance.ymin = 0.0\n",
|
| 53 |
+
" instance.ymax = instance.fmax\n",
|
| 54 |
+
" return instance\n",
|
| 55 |
+
"\n",
|
| 56 |
+
" @classmethod\n",
|
| 57 |
+
" def createWithNumBins(cls, values, numBins=20):\n",
|
| 58 |
+
" \"\"\"\n",
|
| 59 |
+
" create histogram instance values and no of bins\n",
|
| 60 |
+
"\n",
|
| 61 |
+
" Parameters\n",
|
| 62 |
+
" values : y values\n",
|
| 63 |
+
" numBins : no of bins\n",
|
| 64 |
+
" \"\"\"\n",
|
| 65 |
+
" xmin = min(values)\n",
|
| 66 |
+
" xmax = max(values)\n",
|
| 67 |
+
" binWidth = (xmax + .01 - (xmin - .01)) / numBins\n",
|
| 68 |
+
" instance = cls(xmin, binWidth)\n",
|
| 69 |
+
" instance.xmax = xmax\n",
|
| 70 |
+
" instance.numBin = numBins\n",
|
| 71 |
+
" instance.bins = np.zeros(instance.numBin)\n",
|
| 72 |
+
" for v in values:\n",
|
| 73 |
+
" instance.add(v)\n",
|
| 74 |
+
" return instance\n",
|
| 75 |
+
"\n",
|
| 76 |
+
" @classmethod\n",
|
| 77 |
+
" def createUninitialized(cls, xmin, xmax, binWidth):\n",
|
| 78 |
+
" \"\"\"\n",
|
| 79 |
+
"        create histogram instance with no y values using domain min, max and bin width\n",
|
| 80 |
+
"\n",
|
| 81 |
+
" Parameters\n",
|
| 82 |
+
" min : min x\n",
|
| 83 |
+
" max : max x\n",
|
| 84 |
+
" binWidth : bin width\n",
|
| 85 |
+
" \"\"\"\n",
|
| 86 |
+
" instance = cls(xmin, binWidth)\n",
|
| 87 |
+
" instance.xmax = xmax\n",
|
| 88 |
+
"        instance.numBin = int((xmax - xmin) / binWidth) + 1\n",
|
| 89 |
+
" instance.bins = np.zeros(instance.numBin)\n",
|
| 90 |
+
" return instance\n",
|
| 91 |
+
"\n",
|
| 92 |
+
" def initialize(self):\n",
|
| 93 |
+
" \"\"\"\n",
|
| 94 |
+
" set y values to 0\n",
|
| 95 |
+
" \"\"\"\n",
|
| 96 |
+
" self.bins = np.zeros(self.numBin)\n",
|
| 97 |
+
"\n",
|
| 98 |
+
" def add(self, value):\n",
|
| 99 |
+
" \"\"\"\n",
|
| 100 |
+
" adds a value to a bin\n",
|
| 101 |
+
"\n",
|
| 102 |
+
" Parameters\n",
|
| 103 |
+
" value : value\n",
|
| 104 |
+
" \"\"\"\n",
|
| 105 |
+
" bin = int((value - self.xmin) / self.binWidth)\n",
|
| 106 |
+
" if (bin < 0 or bin > self.numBin - 1):\n",
|
| 107 |
+
" print (bin)\n",
|
| 108 |
+
" raise ValueError(\"outside histogram range\")\n",
|
| 109 |
+
" self.bins[bin] += 1.0\n",
|
| 110 |
+
"\n",
|
| 111 |
+
" def normalize(self):\n",
|
| 112 |
+
" \"\"\"\n",
|
| 113 |
+
" normalize bin counts\n",
|
| 114 |
+
" \"\"\"\n",
|
| 115 |
+
" if not self.normalized:\n",
|
| 116 |
+
" total = self.bins.sum()\n",
|
| 117 |
+
" self.bins = np.divide(self.bins, total)\n",
|
| 118 |
+
" self.normalized = True\n",
|
| 119 |
+
"\n",
|
| 120 |
+
" def cumDistr(self):\n",
|
| 121 |
+
" \"\"\"\n",
|
| 122 |
+
"        cumulative distribution\n",
|
| 123 |
+
" \"\"\"\n",
|
| 124 |
+
" self.normalize()\n",
|
| 125 |
+
" self.cbins = np.cumsum(self.bins)\n",
|
| 126 |
+
" return self.cbins\n",
|
| 127 |
+
"\n",
|
| 128 |
+
" def distr(self):\n",
|
| 129 |
+
" \"\"\"\n",
|
| 130 |
+
" distr\n",
|
| 131 |
+
" \"\"\"\n",
|
| 132 |
+
" self.normalize()\n",
|
| 133 |
+
" return self.bins\n",
|
| 134 |
+
"\n",
|
| 135 |
+
"\n",
|
| 136 |
+
" def percentile(self, percent):\n",
|
| 137 |
+
" \"\"\"\n",
|
| 138 |
+
" return value corresponding to a percentile\n",
|
| 139 |
+
"\n",
|
| 140 |
+
" Parameters\n",
|
| 141 |
+
" percent : percentile value\n",
|
| 142 |
+
" \"\"\"\n",
|
| 143 |
+
" if self.cbins is None:\n",
|
| 144 |
+
" raise ValueError(\"cumulative distribution is not available\")\n",
|
| 145 |
+
"\n",
|
| 146 |
+
" for i,cuml in enumerate(self.cbins):\n",
|
| 147 |
+
" if percent > cuml:\n",
|
| 148 |
+
" value = (i * self.binWidth) - (self.binWidth / 2) + \\\n",
|
| 149 |
+
" (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1]) \n",
|
| 150 |
+
" break\n",
|
| 151 |
+
" return value\n",
|
| 152 |
+
"\n",
|
| 153 |
+
" def max(self):\n",
|
| 154 |
+
" \"\"\"\n",
|
| 155 |
+
" return max bin value \n",
|
| 156 |
+
" \"\"\"\n",
|
| 157 |
+
" return self.bins.max()\n",
|
| 158 |
+
"\n",
|
| 159 |
+
" def value(self, x):\n",
|
| 160 |
+
" \"\"\"\n",
|
| 161 |
+
" return a bin value\t\n",
|
| 162 |
+
"\n",
|
| 163 |
+
" Parameters\n",
|
| 164 |
+
" x : x value\n",
|
| 165 |
+
" \"\"\"\n",
|
| 166 |
+
" bin = int((x - self.xmin) / self.binWidth)\n",
|
| 167 |
+
" f = self.bins[bin]\n",
|
| 168 |
+
" return f\n",
|
| 169 |
+
"\n",
|
| 170 |
+
" def bin(self, x):\n",
|
| 171 |
+
" \"\"\"\n",
|
| 172 |
+
" return a bin index\t\n",
|
| 173 |
+
"\n",
|
| 174 |
+
" Parameters\n",
|
| 175 |
+
" x : x value\n",
|
| 176 |
+
" \"\"\"\n",
|
| 177 |
+
" return int((x - self.xmin) / self.binWidth)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
" def cumValue(self, x):\n",
|
| 180 |
+
" \"\"\"\n",
|
| 181 |
+
" return a cumulative bin value\t\n",
|
| 182 |
+
"\n",
|
| 183 |
+
" Parameters\n",
|
| 184 |
+
" x : x value\n",
|
| 185 |
+
" \"\"\"\n",
|
| 186 |
+
" bin = int((x - self.xmin) / self.binWidth)\n",
|
| 187 |
+
" c = self.cbins[bin]\n",
|
| 188 |
+
" return c\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"\n",
|
| 191 |
+
" def getMinMax(self):\n",
|
| 192 |
+
" \"\"\"\n",
|
| 193 |
+
" returns x min and x max\n",
|
| 194 |
+
" \"\"\"\n",
|
| 195 |
+
" return (self.xmin, self.xmax)\n",
|
| 196 |
+
"\n",
|
| 197 |
+
" def boundedValue(self, x):\n",
|
| 198 |
+
" \"\"\"\n",
|
| 199 |
+
"        return x bounded by min and max\n",
|
| 200 |
+
"\n",
|
| 201 |
+
" Parameters\n",
|
| 202 |
+
" x : x value\n",
|
| 203 |
+
" \"\"\"\n",
|
| 204 |
+
" if x < self.xmin:\n",
|
| 205 |
+
" x = self.xmin\n",
|
| 206 |
+
" elif x > self.xmax:\n",
|
| 207 |
+
" x = self.xmax\n",
|
| 208 |
+
" return x\n",
|
| 209 |
+
"\n",
|
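"#illustrative usage sketch (added comment, not in the original code), assuming\n",
"#data is a list of numeric values:\n",
"#hist = Histogram.createWithNumBins(data, 20)\n",
"#distr = hist.distr()\n",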
| 210 |
+
"\"\"\"\n",
|
| 211 |
+
"categorical histogram class\n",
|
| 212 |
+
"\"\"\"\n",
|
| 213 |
+
"class CatHistogram:\n",
|
| 214 |
+
" def __init__(self):\n",
|
| 215 |
+
" \"\"\"\n",
|
| 216 |
+
" initializer\n",
|
| 217 |
+
" \"\"\"\n",
|
| 218 |
+
" self.binCounts = dict()\n",
|
| 219 |
+
" self.counts = 0\n",
|
| 220 |
+
" self.normalized = False\n",
|
| 221 |
+
"\n",
|
| 222 |
+
" def add(self, value):\n",
|
| 223 |
+
" \"\"\"\n",
|
| 224 |
+
" adds a value to a bin\n",
|
| 225 |
+
"\n",
|
| 226 |
+
" Parameters\n",
|
| 227 |
+
"        value : value to add\n",
|
| 228 |
+
" \"\"\"\n",
|
| 229 |
+
" addToKeyedCounter(self.binCounts, value)\n",
|
| 230 |
+
" self.counts += 1\t\n",
|
| 231 |
+
"\n",
|
| 232 |
+
" def normalize(self):\n",
|
| 233 |
+
" \"\"\"\n",
|
| 234 |
+
" normalize\n",
|
| 235 |
+
" \"\"\"\n",
|
| 236 |
+
" if not self.normalized:\n",
|
| 237 |
+
" self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))\n",
|
| 238 |
+
" self.normalized = True\n",
|
| 239 |
+
"\n",
|
| 240 |
+
" def getMode(self):\n",
|
| 241 |
+
" \"\"\"\n",
|
| 242 |
+
" get mode\n",
|
| 243 |
+
" \"\"\"\n",
|
| 244 |
+
" maxk = None\n",
|
| 245 |
+
" maxv = 0\n",
|
| 246 |
+
" #print(self.binCounts)\n",
|
| 247 |
+
" for k,v in self.binCounts.items():\n",
|
| 248 |
+
" if v > maxv:\n",
|
| 249 |
+
" maxk = k\n",
|
| 250 |
+
" maxv = v\n",
|
| 251 |
+
" return (maxk, maxv)\t\n",
|
| 252 |
+
"\n",
|
| 253 |
+
" def getEntropy(self):\n",
|
| 254 |
+
" \"\"\"\n",
|
| 255 |
+
" get entropy\n",
|
| 256 |
+
" \"\"\"\n",
|
| 257 |
+
" self.normalize()\n",
|
| 258 |
+
" entr = 0 \n",
|
| 259 |
+
" #print(self.binCounts)\n",
|
| 260 |
+
" for k,v in self.binCounts.items():\n",
|
| 261 |
+
" entr -= v * math.log(v)\n",
|
| 262 |
+
" return entr\n",
|
| 263 |
+
"\n",
|
| 264 |
+
" def getUniqueValues(self):\n",
|
| 265 |
+
" \"\"\"\n",
|
| 266 |
+
" get unique values\n",
|
| 267 |
+
" \"\"\"\t\t\n",
|
| 268 |
+
" return list(self.binCounts.keys())\n",
|
| 269 |
+
"\n",
|
| 270 |
+
" def getDistr(self):\n",
|
| 271 |
+
" \"\"\"\n",
|
| 272 |
+
" get distribution\n",
|
| 273 |
+
" \"\"\"\t\n",
|
| 274 |
+
" self.normalize()\t\n",
|
| 275 |
+
" return self.binCounts.copy()\n",
|
| 276 |
+
"\n",
|
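"#illustrative usage sketch (added comment, not in the original code):\n",
"#chist = CatHistogram()\n",
"#chist.add(\"a\"); chist.add(\"b\"); chist.add(\"a\")\n",
"#mode, modeCount = chist.getMode()\n",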
| 277 |
+
"class RunningStat:\n",
|
| 278 |
+
" \"\"\"\n",
|
| 279 |
+
" running stat class\n",
|
| 280 |
+
" \"\"\"\n",
|
| 281 |
+
" def __init__(self):\n",
|
| 282 |
+
" \"\"\"\n",
|
| 283 |
+
" initializer\t\n",
|
| 284 |
+
" \"\"\"\n",
|
| 285 |
+
" self.sum = 0.0\n",
|
| 286 |
+
" self.sumSq = 0.0\n",
|
| 287 |
+
" self.count = 0\n",
|
| 288 |
+
"\n",
|
| 289 |
+
" @staticmethod\n",
|
| 290 |
+
" def create(count, sum, sumSq):\n",
|
| 291 |
+
" \"\"\"\n",
|
| 292 |
+
"        creates instance\n",
|
| 293 |
+
"\n",
|
| 294 |
+
" Parameters\n",
|
| 295 |
+
"        count : count of values\n",
"        sum : sum of values\n",
|
| 296 |
+
"        sumSq : sum of values squared\n",
|
| 297 |
+
" \"\"\"\n",
|
| 298 |
+
" rs = RunningStat()\n",
|
| 299 |
+
" rs.sum = sum\n",
|
| 300 |
+
" rs.sumSq = sumSq\n",
|
| 301 |
+
" rs.count = count\n",
|
| 302 |
+
" return rs\n",
|
| 303 |
+
"\n",
|
| 304 |
+
" def add(self, value):\n",
|
| 305 |
+
" \"\"\"\n",
|
| 306 |
+
" adds new value\n",
|
| 307 |
+
" Parameters\n",
|
| 308 |
+
" value : value to add\n",
|
| 309 |
+
" \"\"\"\n",
|
| 310 |
+
" self.sum += value\n",
|
| 311 |
+
" self.sumSq += (value * value)\n",
|
| 312 |
+
" self.count += 1\n",
|
| 313 |
+
"\n",
|
| 314 |
+
" def getStat(self):\n",
|
| 315 |
+
" \"\"\"\n",
|
| 316 |
+
" return mean and std deviation \n",
|
| 317 |
+
" \"\"\"\n",
|
| 318 |
+
" mean = self.sum /self. count\n",
|
| 319 |
+
" t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
|
| 320 |
+
" sd = math.sqrt(t)\n",
|
| 321 |
+
" re = (mean, sd)\n",
|
| 322 |
+
" return re\n",
|
| 323 |
+
"\n",
|
| 324 |
+
" def addGetStat(self,value):\n",
|
| 325 |
+
" \"\"\"\n",
|
| 326 |
+
" calculate mean and std deviation with new value added\n",
|
| 327 |
+
" Parameters\n",
|
| 328 |
+
" value : value to add\n",
|
| 329 |
+
" \"\"\"\n",
|
| 330 |
+
" self.add(value)\n",
|
| 331 |
+
" re = self.getStat()\n",
|
| 332 |
+
" return re\n",
|
| 333 |
+
"\n",
|
| 334 |
+
" def getCount(self):\n",
|
| 335 |
+
" \"\"\"\n",
|
| 336 |
+
" return count\n",
|
| 337 |
+
" \"\"\"\n",
|
| 338 |
+
" return self.count\n",
|
| 339 |
+
"\n",
|
| 340 |
+
" def getState(self):\n",
|
| 341 |
+
" \"\"\"\n",
|
| 342 |
+
" return state\n",
|
| 343 |
+
" \"\"\"\n",
|
| 344 |
+
" s = (self.count, self.sum, self.sumSq)\n",
|
| 345 |
+
" return s\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"class SlidingWindowStat:\n",
|
| 348 |
+
" \"\"\"\n",
|
| 349 |
+
" sliding window stats\n",
|
| 350 |
+
" \"\"\"\n",
|
| 351 |
+
" def __init__(self):\n",
|
| 352 |
+
" \"\"\"\n",
|
| 353 |
+
" initializer\n",
|
| 354 |
+
" \"\"\"\n",
|
| 355 |
+
" self.sum = 0.0\n",
|
| 356 |
+
" self.sumSq = 0.0\n",
|
| 357 |
+
" self.count = 0\n",
|
| 358 |
+
" self.values = None\n",
|
| 359 |
+
"\n",
|
| 360 |
+
" @staticmethod\n",
|
| 361 |
+
" def create(values, sum, sumSq):\n",
|
| 362 |
+
" \"\"\"\n",
|
| 363 |
+
" creates iinstance\t\n",
|
| 364 |
+
"\n",
|
| 365 |
+
" Parameters\n",
|
| 366 |
+
" sum : sum of values\n",
|
| 367 |
+
" sumSq : sum of valure squared\n",
|
| 368 |
+
" \"\"\"\n",
|
| 369 |
+
" sws = SlidingWindowStat()\n",
|
| 370 |
+
" sws.sum = sum\n",
|
| 371 |
+
" sws.sumSq = sumSq\n",
|
| 372 |
+
" self.values = values.copy()\n",
|
| 373 |
+
" sws.count = len(self.values)\n",
|
| 374 |
+
" return sws\n",
|
| 375 |
+
"\n",
|
| 376 |
+
" @staticmethod\n",
|
| 377 |
+
" def initialize(values):\n",
|
| 378 |
+
" \"\"\"\n",
|
| 379 |
+
" creates iinstance\t\n",
|
| 380 |
+
"\n",
|
| 381 |
+
" Parameters\n",
|
| 382 |
+
" values : list of values\n",
|
| 383 |
+
" \"\"\"\n",
|
| 384 |
+
" sws = SlidingWindowStat()\n",
|
| 385 |
+
" sws.values = values.copy()\n",
|
| 386 |
+
" for v in sws.values:\n",
|
| 387 |
+
" sws.sum += v\n",
|
| 388 |
+
" sws.sumSq += v * v\t\t\n",
|
| 389 |
+
" sws.count = len(sws.values)\n",
|
| 390 |
+
" return sws\n",
|
| 391 |
+
"\n",
|
| 392 |
+
" @staticmethod\n",
|
| 393 |
+
" def createEmpty(count):\n",
|
| 394 |
+
" \"\"\"\n",
|
| 395 |
+
" creates iinstance\t\n",
|
| 396 |
+
"\n",
|
| 397 |
+
" Parameters\n",
|
| 398 |
+
" count : count of values\n",
|
| 399 |
+
" \"\"\"\n",
|
| 400 |
+
" sws = SlidingWindowStat()\n",
|
| 401 |
+
" sws.count = count\n",
|
| 402 |
+
" sws.values = list()\n",
|
| 403 |
+
" return sws\n",
|
| 404 |
+
"\n",
|
| 405 |
+
" def add(self, value):\n",
|
| 406 |
+
" \"\"\"\n",
|
| 407 |
+
" adds new value\n",
|
| 408 |
+
"\n",
|
| 409 |
+
" Parameters\n",
|
| 410 |
+
" value : value to add\n",
|
| 411 |
+
" \"\"\"\n",
|
| 412 |
+
" self.values.append(value)\t\t\n",
|
| 413 |
+
" if len(self.values) > self.count:\n",
|
| 414 |
+
" self.sum += value - self.values[0]\n",
|
| 415 |
+
" self.sumSq += (value * value) - (self.values[0] * self.values[0])\n",
|
| 416 |
+
" self.values.pop(0)\n",
|
| 417 |
+
" else:\n",
|
| 418 |
+
" self.sum += value\n",
|
| 419 |
+
" self.sumSq += (value * value)\n",
|
| 420 |
+
"\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" def getStat(self):\n",
|
| 423 |
+
" \"\"\"\n",
|
| 424 |
+
" calculate mean and std deviation \n",
|
| 425 |
+
" \"\"\"\n",
|
| 426 |
+
" mean = self.sum /self. count\n",
|
| 427 |
+
" t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
|
| 428 |
+
" sd = math.sqrt(t)\n",
|
| 429 |
+
" re = (mean, sd)\n",
|
| 430 |
+
" return re\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" def addGetStat(self,value):\n",
|
| 433 |
+
" \"\"\"\n",
|
| 434 |
+
" calculate mean and std deviation with new value added\n",
|
| 435 |
+
" \"\"\"\n",
|
| 436 |
+
" self.add(value)\n",
|
| 437 |
+
" re = self.getStat()\n",
|
| 438 |
+
" return re\n",
|
| 439 |
+
"\n",
|
| 440 |
+
" def getCount(self):\n",
|
| 441 |
+
" \"\"\"\n",
|
| 442 |
+
" return count\n",
|
| 443 |
+
" \"\"\"\n",
|
| 444 |
+
" return self.count\n",
|
| 445 |
+
"\n",
|
| 446 |
+
" def getCurSize(self):\n",
|
| 447 |
+
" \"\"\"\n",
|
| 448 |
+
" return count\n",
|
| 449 |
+
" \"\"\"\n",
|
| 450 |
+
" return len(self.values)\n",
|
| 451 |
+
"\n",
|
| 452 |
+
" def getState(self):\n",
|
| 453 |
+
" \"\"\"\n",
|
| 454 |
+
" return state\n",
|
| 455 |
+
" \"\"\"\n",
|
| 456 |
+
" s = (self.count, self.sum, self.sumSq)\n",
|
| 457 |
+
" return s\n",
|
| 458 |
+
"\n",
|
| 459 |
+
"\n",
|
| 460 |
+
"def basicStat(ldata):\n",
|
| 461 |
+
" \"\"\"\n",
|
| 462 |
+
" mean and std dev\n",
|
| 463 |
+
" Parameters\n",
|
| 464 |
+
" ldata : list of values\n",
|
| 465 |
+
" \"\"\"\n",
|
| 466 |
+
" m = statistics.mean(ldata)\n",
|
| 467 |
+
" s = statistics.stdev(ldata, xbar=m)\n",
|
| 468 |
+
" r = (m, s)\n",
|
| 469 |
+
" return r\n",
|
| 470 |
+
"\n",
|
| 471 |
+
"def getFileColumnStat(filePath, col, delem=\",\"):\n",
|
| 472 |
+
" \"\"\"\n",
|
| 473 |
+
" gets stats for a file column\n",
|
| 474 |
+
"\n",
|
| 475 |
+
" Parameters\n",
|
| 476 |
+
" filePath : file path\n",
|
| 477 |
+
" col : col index\n",
|
| 478 |
+
" delem : field delemter\n",
|
| 479 |
+
" \"\"\"\n",
|
| 480 |
+
" rs = RunningStat()\n",
|
| 481 |
+
" for rec in fileRecGen(filePath, delem):\n",
|
| 482 |
+
" va = float(rec[col])\n",
|
| 483 |
+
" rs.add(va)\n",
|
| 484 |
+
"\n",
|
| 485 |
+
" return rs.getStat()\n"
|
| 486 |
+
]
|
| 487 |
+
}
|
| 488 |
+
],
|
| 489 |
+
"metadata": {
|
| 490 |
+
"kernelspec": {
|
| 491 |
+
"display_name": "Python 3 (ipykernel)",
|
| 492 |
+
"language": "python",
|
| 493 |
+
"name": "python3"
|
| 494 |
+
},
|
| 495 |
+
"language_info": {
|
| 496 |
+
"codemirror_mode": {
|
| 497 |
+
"name": "ipython",
|
| 498 |
+
"version": 3
|
| 499 |
+
},
|
| 500 |
+
"file_extension": ".py",
|
| 501 |
+
"mimetype": "text/x-python",
|
| 502 |
+
"name": "python",
|
| 503 |
+
"nbconvert_exporter": "python",
|
| 504 |
+
"pygments_lexer": "ipython3",
|
| 505 |
+
"version": "3.9.12"
|
| 506 |
+
}
|
| 507 |
+
},
|
| 508 |
+
"nbformat": 4,
|
| 509 |
+
"nbformat_minor": 5
|
| 510 |
+
}
|
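RunningStat and SlidingWindowStat above both recover the mean and sample standard deviation from their running sums, i.e. mean = sum / n and sd = sqrt((sumSq - n * mean^2) / (n - 1)); SlidingWindowStat simply restricts those sums to the most recent count values. A minimal usage sketch follows, assuming this notebook is importable as the mlutil module (the other lib notebooks import from mlutil) and using made-up sample values:

```python
# minimal usage sketch for the statistics helpers defined above
# assumption: the notebook is available as the mlutil module; sample values are made up
from mlutil import RunningStat, SlidingWindowStat, CatHistogram

# running mean / sample std dev over everything seen so far
rs = RunningStat()
for v in [2.0, 4.0, 6.0, 8.0]:
    rs.add(v)
print(rs.getStat())          # (mean, sample std dev)

# same statistics, but only over the last 3 values
sws = SlidingWindowStat.initialize([1.0, 2.0, 3.0])
print(sws.addGetStat(10.0))  # stats over the window [2.0, 3.0, 10.0]

# categorical histogram: mode and entropy
ch = CatHistogram()
for c in ["a", "b", "a", "c", "a"]:
    ch.add(c)
print(ch.getMode())          # ('a', 3) since counts are not yet normalized
print(ch.getEntropy())       # entropy of the normalized distribution
```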
lib/tnn.ipynb
ADDED
|
@@ -0,0 +1,800 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "3853095d",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"import matplotlib.pyplot as plt\n",
|
| 13 |
+
"import numpy as np\n",
|
| 14 |
+
"import torch\n",
|
| 15 |
+
"from torch.autograd import Variable\n",
|
| 16 |
+
"from torch.utils.data import Dataset, TensorDataset\n",
|
| 17 |
+
"from torch.utils.data import DataLoader\n",
|
| 18 |
+
"import sklearn as sk\n",
|
| 19 |
+
"from sklearn.neighbors import KDTree\n",
|
| 20 |
+
"import matplotlib\n",
|
| 21 |
+
"import random\n",
|
| 22 |
+
"import jprops\n",
|
| 23 |
+
"from random import randint\n",
|
| 24 |
+
"import statistics\n",
|
| 25 |
+
"sys.path.append(os.path.abspath(\"../lib\"))\n",
|
| 26 |
+
"from util import *\n",
|
| 27 |
+
"from mlutil import *\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"\"\"\"\n",
|
| 30 |
+
"forward hook function\n",
|
| 31 |
+
"\"\"\"\n",
|
| 32 |
+
"intermedOut = {}\n",
|
| 33 |
+
"lvalues = list()\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"def hookFn(m, i, o):\n",
|
| 36 |
+
" \"\"\"\n",
|
| 37 |
+
" call back for latent values\n",
|
| 38 |
+
" \"\"\"\n",
|
| 39 |
+
" #intermedOut[m] = o\n",
|
| 40 |
+
" lv = o.data.cpu().numpy()\n",
|
| 41 |
+
" lv = lv[0].tolist()\n",
|
| 42 |
+
" lvalues.append(lv)\n",
|
| 43 |
+
" #print(lv)\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"def getLatValues():\n",
|
| 46 |
+
" \"\"\"\n",
|
| 47 |
+
" \"\"\"\n",
|
| 48 |
+
" return lvalues\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"class FeedForwardNetwork(torch.nn.Module):\n",
|
| 51 |
+
" def __init__(self, configFile, addDefValues=None):\n",
|
| 52 |
+
" \"\"\"\n",
|
| 53 |
+
" In the constructor we instantiate two nn.Linear modules and assign them as\n",
|
| 54 |
+
" member variables.\n",
|
| 55 |
+
"\n",
|
| 56 |
+
" Parameters\n",
|
| 57 |
+
" configFile : config file path\n",
|
| 58 |
+
" addDefValues : dictionary of additional default values\t\n",
|
| 59 |
+
" \"\"\"\n",
|
| 60 |
+
" defValues = dict() if addDefValues is None else addDefValues.copy()\n",
|
| 61 |
+
" defValues[\"common.mode\"] = (\"training\", None)\n",
|
| 62 |
+
" defValues[\"common.model.directory\"] = (\"model\", None)\n",
|
| 63 |
+
" defValues[\"common.model.file\"] = (None, None)\n",
|
| 64 |
+
" defValues[\"common.preprocessing\"] = (None, None)\n",
|
| 65 |
+
" defValues[\"common.scaling.method\"] = (\"zscale\", None)\n",
|
| 66 |
+
" defValues[\"common.scaling.minrows\"] = (50, None)\n",
|
| 67 |
+
" defValues[\"common.scaling.param.file\"] = (None, None)\n",
|
| 68 |
+
" defValues[\"common.verbose\"] = (False, None)\n",
|
| 69 |
+
" defValues[\"common.device\"] = (\"cpu\", None)\n",
|
| 70 |
+
" defValues[\"train.data.file\"] = (None, \"missing training data file\")\n",
|
| 71 |
+
" defValues[\"train.data.fields\"] = (None, \"missing training data field ordinals\")\n",
|
| 72 |
+
" defValues[\"train.data.feature.fields\"] = (None, \"missing training data feature field ordinals\")\n",
|
| 73 |
+
" defValues[\"train.data.out.fields\"] = (None, \"missing training data feature field ordinals\")\n",
|
| 74 |
+
" defValues[\"train.layer.data\"] = (None, \"missing layer data\")\n",
|
| 75 |
+
" defValues[\"train.input.size\"] = (None, None)\n",
|
| 76 |
+
" defValues[\"train.output.size\"] = (None, \"missing output size\")\n",
|
| 77 |
+
" defValues[\"train.batch.size\"] = (10, None)\n",
|
| 78 |
+
" defValues[\"train.loss.reduction\"] = (\"mean\", None)\n",
|
| 79 |
+
" defValues[\"train.num.iterations\"] = (500, None)\n",
|
| 80 |
+
" defValues[\"train.lossFn\"] = (\"mse\", None) \n",
|
| 81 |
+
" defValues[\"train.optimizer\"] = (\"sgd\", None) \n",
|
| 82 |
+
" defValues[\"train.opt.learning.rate\"] = (.0001, None)\n",
|
| 83 |
+
" defValues[\"train.opt.weight.decay\"] = (0, None) \n",
|
| 84 |
+
" defValues[\"train.opt.momentum\"] = (0, None) \n",
|
| 85 |
+
" defValues[\"train.opt.eps\"] = (1e-08, None) \n",
|
| 86 |
+
" defValues[\"train.opt.dampening\"] = (0, None) \n",
|
| 87 |
+
" defValues[\"train.opt.momentum.nesterov\"] = (False, None) \n",
|
| 88 |
+
" defValues[\"train.opt.betas\"] = ([0.9, 0.999], None) \n",
|
| 89 |
+
" defValues[\"train.opt.alpha\"] = (0.99, None) \n",
|
| 90 |
+
" defValues[\"train.save.model\"] = (False, None) \n",
|
| 91 |
+
" defValues[\"train.track.error\"] = (False, None) \n",
|
| 92 |
+
" defValues[\"train.epoch.intv\"] = (5, None) \n",
|
| 93 |
+
" defValues[\"train.batch.intv\"] = (5, None) \n",
|
| 94 |
+
" defValues[\"train.print.weights\"] = (False, None) \n",
|
| 95 |
+
" defValues[\"valid.data.file\"] = (None, None)\n",
|
| 96 |
+
" defValues[\"valid.accuracy.metric\"] = (None, None)\n",
|
| 97 |
+
" defValues[\"predict.data.file\"] = (None, None)\n",
|
| 98 |
+
" defValues[\"predict.use.saved.model\"] = (True, None)\n",
|
| 99 |
+
" defValues[\"predict.output\"] = (\"binary\", None)\n",
|
| 100 |
+
" defValues[\"predict.feat.pad.size\"] = (60, None)\n",
|
| 101 |
+
" defValues[\"predict.print.output\"] = (True, None)\n",
|
| 102 |
+
" defValues[\"calibrate.num.bins\"] = (10, None)\n",
|
| 103 |
+
" defValues[\"calibrate.pred.prob.thresh\"] = (0.5, None)\n",
|
| 104 |
+
" defValues[\"calibrate.num.nearest.neighbors\"] = (10, None)\n",
|
| 105 |
+
" self.config = Configuration(configFile, defValues)\n",
|
| 106 |
+
"\n",
|
| 107 |
+
" super(FeedForwardNetwork, self).__init__()\n",
|
| 108 |
+
"\n",
|
| 109 |
+
" def setConfigParam(self, name, value):\n",
|
| 110 |
+
" \"\"\"\n",
|
| 111 |
+
" set config param\n",
|
| 112 |
+
"\n",
|
| 113 |
+
" Parameters\n",
|
| 114 |
+
" name : config name\n",
|
| 115 |
+
" value : config value\n",
|
| 116 |
+
" \"\"\"\n",
|
| 117 |
+
" self.config.setParam(name, value)\n",
|
| 118 |
+
"\n",
|
| 119 |
+
" def getConfig(self):\n",
|
| 120 |
+
" \"\"\"\n",
|
| 121 |
+
" get config object\n",
|
| 122 |
+
" \"\"\"\n",
|
| 123 |
+
" return self.config\n",
|
| 124 |
+
"\n",
|
| 125 |
+
" def setVerbose(self, verbose):\n",
|
| 126 |
+
" self.verbose = verbose\n",
|
| 127 |
+
"\n",
|
| 128 |
+
" def buildModel(self):\n",
|
| 129 |
+
" \"\"\"\n",
|
| 130 |
+
" Loads configuration and builds the various piecess necessary for the model\n",
|
| 131 |
+
" \"\"\"\n",
|
| 132 |
+
" torch.manual_seed(9999)\n",
|
| 133 |
+
"\n",
|
| 134 |
+
" self.verbose = self.config.getBooleanConfig(\"common.verbose\")[0]\n",
|
| 135 |
+
" numinp = self.config.getIntConfig(\"train.input.size\")[0]\n",
|
| 136 |
+
" if numinp is None:\n",
|
| 137 |
+
" numinp = len(self.config.getIntListConfig(\"train.data.feature.fields\")[0])\n",
|
| 138 |
+
" #numOut = len(self.config.getStringConfig(\"train.data.out.fields\")[0].split(\",\"))\n",
|
| 139 |
+
" self.outputSize = self.config.getIntConfig(\"train.output.size\")[0]\n",
|
| 140 |
+
" self.batchSize = self.config.getIntConfig(\"train.batch.size\")[0]\n",
|
| 141 |
+
" #lossRed = self.config.getStringConfig(\"train.loss.reduction\")[0]\n",
|
| 142 |
+
" #learnRate = self.config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
|
| 143 |
+
" self.numIter = self.config.getIntConfig(\"train.num.iterations\")[0]\n",
|
| 144 |
+
" optimizer = self.config.getStringConfig(\"train.optimizer\")[0]\n",
|
| 145 |
+
" self.lossFnStr = self.config.getStringConfig(\"train.lossFn\")[0]\n",
|
| 146 |
+
" self.accMetric = self.config.getStringConfig(\"valid.accuracy.metric\")[0]\n",
|
| 147 |
+
" self.trackErr = self.config.getBooleanConfig(\"train.track.error\")[0]\n",
|
| 148 |
+
" self.batchIntv = self.config.getIntConfig(\"train.batch.intv\")[0]\n",
|
| 149 |
+
" self.restored = False\n",
|
| 150 |
+
" self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None\n",
|
| 151 |
+
"\n",
|
| 152 |
+
" #build network\n",
|
| 153 |
+
" layers = list()\n",
|
| 154 |
+
" ninp = numinp\n",
|
| 155 |
+
" trData = self.config.getStringConfig(\"train.layer.data\")[0].split(\",\")\n",
|
| 156 |
+
" for ld in trData:\n",
|
| 157 |
+
" lde = ld.split(\":\")\n",
|
| 158 |
+
" assert len(lde) == 5, \"expecting 5 items for layer data\"\n",
|
| 159 |
+
"\n",
|
| 160 |
+
" #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction\n",
|
| 161 |
+
" nunit = int(lde[0])\n",
|
| 162 |
+
" actStr = lde[1]\n",
|
| 163 |
+
" act = FeedForwardNetwork.createActivation(actStr) if actStr != \"none\" else None\n",
|
| 164 |
+
" bnorm = lde[2] == \"true\"\n",
|
| 165 |
+
" afterAct = lde[3] == \"true\"\n",
|
| 166 |
+
" dpr = float(lde[4])\n",
|
| 167 |
+
"\n",
|
| 168 |
+
" layers.append(torch.nn.Linear(ninp, nunit))\t\t\t\n",
|
| 169 |
+
" if bnorm:\n",
|
| 170 |
+
" #with batch norm\n",
|
| 171 |
+
" if afterAct:\n",
|
| 172 |
+
" safeAppend(layers, act)\n",
|
| 173 |
+
" layers.append(torch.nn.BatchNorm1d(nunit))\n",
|
| 174 |
+
" else:\n",
|
| 175 |
+
" layers.append(torch.nn.BatchNorm1d(nunit))\n",
|
| 176 |
+
" safeAppend(layers, act)\n",
|
| 177 |
+
" else:\n",
|
| 178 |
+
" #without batch norm\n",
|
| 179 |
+
" safeAppend(layers, act)\n",
|
| 180 |
+
"\n",
|
| 181 |
+
" if dpr > 0:\n",
|
| 182 |
+
" layers.append(torch.nn.Dropout(dpr))\n",
|
| 183 |
+
" ninp = nunit\n",
|
| 184 |
+
"\n",
|
| 185 |
+
" self.layers = torch.nn.Sequential(*layers)\t\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" self.device = FeedForwardNetwork.getDevice(self)\n",
|
| 188 |
+
"\n",
|
| 189 |
+
" #training data\n",
|
| 190 |
+
" dataFile = self.config.getStringConfig(\"train.data.file\")[0]\n",
|
| 191 |
+
" (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)\n",
|
| 192 |
+
" self.featData = torch.from_numpy(featData)\n",
|
| 193 |
+
" self.outData = torch.from_numpy(outData)\n",
|
| 194 |
+
"\n",
|
| 195 |
+
" #validation data\n",
|
| 196 |
+
" dataFile = self.config.getStringConfig(\"valid.data.file\")[0]\n",
|
| 197 |
+
" (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)\n",
|
| 198 |
+
" self.validFeatData = torch.from_numpy(featDataV)\n",
|
| 199 |
+
" self.validOutData = torch.from_numpy(outDataV)\n",
|
| 200 |
+
"\n",
|
| 201 |
+
" # loss function and optimizer\n",
|
| 202 |
+
" self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)\n",
|
| 203 |
+
" self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)\n",
|
| 204 |
+
"\n",
|
| 205 |
+
" self.yPred = None\n",
|
| 206 |
+
" self.restored = False\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" #mode to device\n",
|
| 209 |
+
" self.device = FeedForwardNetwork.getDevice(self)\t\n",
|
| 210 |
+
" self.featData = self.featData.to(self.device)\n",
|
| 211 |
+
" self.outData = self.outData.to(self.device)\n",
|
| 212 |
+
" self.validFeatData = self.validFeatData.to(self.device)\n",
|
| 213 |
+
" self.to(self.device)\n",
|
| 214 |
+
"\n",
|
| 215 |
+
" @staticmethod\n",
|
| 216 |
+
" def getDevice(model):\n",
|
| 217 |
+
" \"\"\"\n",
|
| 218 |
+
" gets device\n",
|
| 219 |
+
"\n",
|
| 220 |
+
" Parameters\n",
|
| 221 |
+
" model : torch model\n",
|
| 222 |
+
" \"\"\"\n",
|
| 223 |
+
" devType = model.config.getStringConfig(\"common.device\")[0]\n",
|
| 224 |
+
" if devType == \"cuda\":\n",
|
| 225 |
+
" if torch.cuda.is_available():\n",
|
| 226 |
+
" device = torch.device(\"cuda\")\n",
|
| 227 |
+
" else:\n",
|
| 228 |
+
" exitWithMsg(\"cuda not available\")\n",
|
| 229 |
+
" else:\n",
|
| 230 |
+
" device = torch.device(\"cpu\")\n",
|
| 231 |
+
" return device\n",
|
| 232 |
+
"\n",
|
| 233 |
+
" def setValidationData(self, dataSource, prep=True):\n",
|
| 234 |
+
" \"\"\"\n",
|
| 235 |
+
" sets validation data\n",
|
| 236 |
+
"\n",
|
| 237 |
+
" Parameters\n",
|
| 238 |
+
" dataSource : data source str if file path or 2D array\n",
|
| 239 |
+
" prep : if True load and prepare \n",
|
| 240 |
+
" \"\"\"\n",
|
| 241 |
+
" if prep:\n",
|
| 242 |
+
" (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)\n",
|
| 243 |
+
" self.validFeatData = torch.from_numpy(featDataV)\n",
|
| 244 |
+
" self.validOutData = outDataV\n",
|
| 245 |
+
" else:\n",
|
| 246 |
+
" self.validFeatData = torch.from_numpy(dataSource[0])\n",
|
| 247 |
+
" self.validOutData = dataSource[1]\t\t\n",
|
| 248 |
+
"\n",
|
| 249 |
+
" self.validFeatData = self.validFeatData.to(self.device)\n",
|
| 250 |
+
"\n",
|
| 251 |
+
" @staticmethod\n",
|
| 252 |
+
" def createActivation(actName):\n",
|
| 253 |
+
" \"\"\"\n",
|
| 254 |
+
" create activation\n",
|
| 255 |
+
"\n",
|
| 256 |
+
" Parameters\n",
|
| 257 |
+
" actName : activation name\n",
|
| 258 |
+
" \"\"\"\n",
|
| 259 |
+
" if actName is None:\n",
|
| 260 |
+
" activation = None\n",
|
| 261 |
+
" elif actName == \"relu\":\n",
|
| 262 |
+
" activation = torch.nn.ReLU()\n",
|
| 263 |
+
" elif actName == \"tanh\":\n",
|
| 264 |
+
" activation = torch.nn.Tanh()\n",
|
| 265 |
+
" elif actName == \"sigmoid\":\n",
|
| 266 |
+
" activation = torch.nn.Sigmoid()\n",
|
| 267 |
+
" elif actName == \"softmax\":\n",
|
| 268 |
+
" activation = torch.nn.Softmax(dim=1)\n",
|
| 269 |
+
" else:\n",
|
| 270 |
+
" exitWithMsg(\"invalid activation function name \" + actName)\n",
|
| 271 |
+
" return activation\n",
|
| 272 |
+
"\n",
|
| 273 |
+
" @staticmethod\n",
|
| 274 |
+
" def createLossFunction(model, lossFnName):\n",
|
| 275 |
+
" \"\"\"\n",
|
| 276 |
+
" create loss function\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" Parameters\n",
|
| 279 |
+
" lossFnName : loss function name\n",
|
| 280 |
+
" \"\"\"\n",
|
| 281 |
+
" config = model.config\n",
|
| 282 |
+
" lossRed = config.getStringConfig(\"train.loss.reduction\")[0]\n",
|
| 283 |
+
" if lossFnName == \"ltwo\" or lossFnName == \"mse\":\n",
|
| 284 |
+
" lossFunc = torch.nn.MSELoss(reduction=lossRed)\n",
|
| 285 |
+
" elif lossFnName == \"ce\":\n",
|
| 286 |
+
" lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)\n",
|
| 287 |
+
" elif lossFnName == \"lone\" or lossFnName == \"mae\":\n",
|
| 288 |
+
" lossFunc = torch.nn.L1Loss(reduction=lossRed)\n",
|
| 289 |
+
" elif lossFnName == \"bce\":\n",
|
| 290 |
+
" lossFunc = torch.nn.BCELoss(reduction=lossRed)\n",
|
| 291 |
+
" elif lossFnName == \"bcel\":\n",
|
| 292 |
+
" lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)\n",
|
| 293 |
+
" elif lossFnName == \"sm\":\n",
|
| 294 |
+
" lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)\n",
|
| 295 |
+
" elif lossFnName == \"mlsm\":\n",
|
| 296 |
+
" lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)\n",
|
| 297 |
+
" else:\n",
|
| 298 |
+
" exitWithMsg(\"invalid loss function name \" + lossFnName)\n",
|
| 299 |
+
" return lossFunc\n",
|
| 300 |
+
"\n",
|
| 301 |
+
" @staticmethod\n",
|
| 302 |
+
" def createOptimizer(model, optName):\n",
|
| 303 |
+
" \"\"\"\n",
|
| 304 |
+
" create optimizer\n",
|
| 305 |
+
"\n",
|
| 306 |
+
" Parameters\n",
|
| 307 |
+
" optName : optimizer name\n",
|
| 308 |
+
" \"\"\"\n",
|
| 309 |
+
" config = model.config\n",
|
| 310 |
+
" learnRate = config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
|
| 311 |
+
" weightDecay = config.getFloatConfig(\"train.opt.weight.decay\")[0]\n",
|
| 312 |
+
" momentum = config.getFloatConfig(\"train.opt.momentum\")[0]\n",
|
| 313 |
+
" eps = config.getFloatConfig(\"train.opt.eps\")[0]\n",
|
| 314 |
+
" if optName == \"sgd\":\n",
|
| 315 |
+
" dampening = config.getFloatConfig(\"train.opt.dampening\")[0]\n",
|
| 316 |
+
" momentumNesterov = config.getBooleanConfig(\"train.opt.momentum.nesterov\")[0]\n",
|
| 317 |
+
" optimizer = torch.optim.SGD(model.parameters(),lr=learnRate, momentum=momentum, \n",
|
| 318 |
+
" dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)\n",
|
| 319 |
+
" elif optName == \"adam\":\n",
|
| 320 |
+
" betas = config.getFloatListConfig(\"train.opt.betas\")[0]\n",
|
| 321 |
+
" betas = (betas[0], betas[1]) \n",
|
| 322 |
+
" optimizer = torch.optim.Adam(model.parameters(), lr=learnRate,betas=betas, eps = eps,\n",
|
| 323 |
+
" weight_decay=weightDecay)\n",
|
| 324 |
+
" elif optName == \"rmsprop\":\n",
|
| 325 |
+
" alpha = config.getFloatConfig(\"train.opt.alpha\")[0]\n",
|
| 326 |
+
" optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,\n",
|
| 327 |
+
" eps=eps, weight_decay=weightDecay, momentum=momentum)\n",
|
| 328 |
+
" else:\n",
|
| 329 |
+
" exitWithMsg(\"invalid optimizer name \" + optName)\n",
|
| 330 |
+
" return optimizer\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"\n",
|
| 333 |
+
" def forward(self, x):\n",
|
| 334 |
+
" \"\"\"\n",
|
| 335 |
+
" In the forward function we accept a Tensor of input data and we must return\n",
|
| 336 |
+
" a Tensor of output data. We can use Modules defined in the constructor as\n",
|
| 337 |
+
" well as arbitrary (differentiable) operations on Tensors.\n",
|
| 338 |
+
"\n",
|
| 339 |
+
" Parameters\n",
|
| 340 |
+
" x : data batch\n",
|
| 341 |
+
" \"\"\"\n",
|
| 342 |
+
" y = self.layers(x)\t\n",
|
| 343 |
+
" return y\n",
|
| 344 |
+
"\n",
|
| 345 |
+
" @staticmethod\n",
|
| 346 |
+
" def addForwardHook(model, l, cl = 0):\n",
|
| 347 |
+
" \"\"\"\n",
|
| 348 |
+
" register forward hooks\n",
|
| 349 |
+
"\n",
|
| 350 |
+
" Parameters\n",
|
| 351 |
+
" l : \n",
|
| 352 |
+
" cl :\n",
|
| 353 |
+
" \"\"\"\n",
|
| 354 |
+
" for name, layer in model._modules.items():\n",
|
| 355 |
+
" #If it is a sequential, don't register a hook on it\n",
|
| 356 |
+
" # but recursively register hook on all it's module children\n",
|
| 357 |
+
" print(str(cl) + \" : \" + name)\n",
|
| 358 |
+
" if isinstance(layer, torch.nn.Sequential):\n",
|
| 359 |
+
" FeedForwardNetwork.addForwardHook(layer, l, cl)\n",
|
| 360 |
+
" else:\n",
|
| 361 |
+
" #\t it's a non sequential. Register a hook\n",
|
| 362 |
+
" if cl == l:\n",
|
| 363 |
+
" print(\"setting hook at layer \" + str(l))\n",
|
| 364 |
+
" layer.register_forward_hook(hookFn)\n",
|
| 365 |
+
" cl += 1\n",
|
| 366 |
+
"\n",
|
| 367 |
+
" @staticmethod\n",
|
| 368 |
+
" def prepData(model, dataSource, includeOutFld=True):\n",
|
| 369 |
+
" \"\"\"\n",
|
| 370 |
+
" loads and prepares data\n",
|
| 371 |
+
"\n",
|
| 372 |
+
" Parameters\n",
|
| 373 |
+
" dataSource : data source str if file path or 2D array\n",
|
| 374 |
+
" includeOutFld : True if target freld to be included\n",
|
| 375 |
+
" \"\"\"\n",
|
| 376 |
+
" # parameters\n",
|
| 377 |
+
" fieldIndices = model.config.getIntListConfig(\"train.data.fields\")[0]\n",
|
| 378 |
+
" featFieldIndices = model.config.getIntListConfig(\"train.data.feature.fields\")[0]\n",
|
| 379 |
+
"\n",
|
| 380 |
+
" #all data and feature data\n",
|
| 381 |
+
" isDataFile = isinstance(dataSource, str)\n",
|
| 382 |
+
" selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]\n",
|
| 383 |
+
" if isDataFile: \n",
|
| 384 |
+
" #source file path \n",
|
| 385 |
+
" (data, featData) = loadDataFile(dataSource, \",\", selFieldIndices, featFieldIndices)\n",
|
| 386 |
+
" else:\n",
|
| 387 |
+
" # tabular data\n",
|
| 388 |
+
" data = tableSelFieldsFilter(dataSource, selFieldIndices)\n",
|
| 389 |
+
" featData = tableSelFieldsFilter(data, featFieldIndices)\n",
|
| 390 |
+
" #print(featData)\n",
|
| 391 |
+
" featData = np.array(featData)\n",
|
| 392 |
+
"\n",
|
| 393 |
+
" if (model.config.getStringConfig(\"common.preprocessing\")[0] == \"scale\"):\n",
|
| 394 |
+
" scalingMethod = model.config.getStringConfig(\"common.scaling.method\")[0]\n",
|
| 395 |
+
"\n",
|
| 396 |
+
" #scale only if there are enough rows\n",
|
| 397 |
+
" nrow = featData.shape[0]\n",
|
| 398 |
+
" minrows = model.config.getIntConfig(\"common.scaling.minrows\")[0]\n",
|
| 399 |
+
" if nrow > minrows:\n",
|
| 400 |
+
" #in place scaling\n",
|
| 401 |
+
" featData = scaleData(featData, scalingMethod)\n",
|
| 402 |
+
" else:\n",
|
| 403 |
+
" #use pre computes scaling parameters\n",
|
| 404 |
+
" spFile = model.config.getStringConfig(\"common.scaling.param.file\")[0]\n",
|
| 405 |
+
" if spFile is None:\n",
|
| 406 |
+
" exitWithMsg(\"for small data sets pre computed scaling parameters need to provided\")\n",
|
| 407 |
+
" scParams = restoreObject(spFile)\n",
|
| 408 |
+
" featData = scaleDataWithParams(featData, scalingMethod, scParams)\n",
|
| 409 |
+
" featData = np.array(featData)\n",
|
| 410 |
+
"\n",
|
| 411 |
+
" # target data\n",
|
| 412 |
+
" if includeOutFld:\n",
|
| 413 |
+
" outFieldIndices = model.config.getStringConfig(\"train.data.out.fields\")[0]\n",
|
| 414 |
+
" outFieldIndices = strToIntArray(outFieldIndices, \",\")\n",
|
| 415 |
+
" if isDataFile:\n",
|
| 416 |
+
" outData = data[:,outFieldIndices]\n",
|
| 417 |
+
" else:\n",
|
| 418 |
+
" outData = tableSelFieldsFilter(data, outFieldIndices)\n",
|
| 419 |
+
" outData = np.array(outData)\n",
|
| 420 |
+
" foData = (featData.astype(np.float32), outData.astype(np.float32))\n",
|
| 421 |
+
" else:\n",
|
| 422 |
+
" foData = featData.astype(np.float32)\n",
|
| 423 |
+
" return foData\n",
|
| 424 |
+
"\n",
|
| 425 |
+
" @staticmethod\n",
|
| 426 |
+
" def saveCheckpt(model):\n",
|
| 427 |
+
" \"\"\"\n",
|
| 428 |
+
" checkpoints model\n",
|
| 429 |
+
"\n",
|
| 430 |
+
" Parameters\n",
|
| 431 |
+
" model : torch model\n",
|
| 432 |
+
" \"\"\"\n",
|
| 433 |
+
" print(\"..saving model checkpoint\")\n",
|
| 434 |
+
" modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
|
| 435 |
+
" assert os.path.exists(modelDirectory), \"model save directory does not exist\"\n",
|
| 436 |
+
" modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
|
| 437 |
+
" filepath = os.path.join(modelDirectory, modelFile)\n",
|
| 438 |
+
" state = {\"state_dict\": model.state_dict(), \"optim_dict\": model.optimizer.state_dict()}\n",
|
| 439 |
+
" torch.save(state, filepath)\n",
|
| 440 |
+
" if model.verbose:\n",
|
| 441 |
+
" print(\"model saved\")\n",
|
| 442 |
+
"\n",
|
| 443 |
+
" @staticmethod\n",
|
| 444 |
+
" def restoreCheckpt(model, loadOpt=False):\n",
|
| 445 |
+
" \"\"\"\n",
|
| 446 |
+
" restored checkpointed model\n",
|
| 447 |
+
"\n",
|
| 448 |
+
" Parameters\n",
|
| 449 |
+
" model : torch model\n",
|
| 450 |
+
" loadOpt : True if optimizer to be loaded\n",
|
| 451 |
+
" \"\"\"\n",
|
| 452 |
+
" if not model.restored:\n",
|
| 453 |
+
" print(\"..restoring model checkpoint\")\n",
|
| 454 |
+
" modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
|
| 455 |
+
" modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
|
| 456 |
+
" filepath = os.path.join(modelDirectory, modelFile)\n",
|
| 457 |
+
" assert os.path.exists(filepath), \"model save file does not exist\"\n",
|
| 458 |
+
" checkpoint = torch.load(filepath)\n",
|
| 459 |
+
" model.load_state_dict(checkpoint[\"state_dict\"])\n",
|
| 460 |
+
" model.to(model.device)\n",
|
| 461 |
+
" if loadOpt:\n",
|
| 462 |
+
" model.optimizer.load_state_dict(checkpoint[\"optim_dict\"])\n",
|
| 463 |
+
" model.restored = True\n",
|
| 464 |
+
"\n",
|
| 465 |
+
" @staticmethod\n",
|
| 466 |
+
" def processClassifOutput(yPred, config):\n",
|
| 467 |
+
" \"\"\"\n",
|
| 468 |
+
" extracts probability label 1 or label with highest probability\n",
|
| 469 |
+
"\n",
|
| 470 |
+
" Parameters\n",
|
| 471 |
+
" yPred : predicted output\n",
|
| 472 |
+
" config : config object\n",
|
| 473 |
+
" \"\"\"\n",
|
| 474 |
+
" outType = config.getStringConfig(\"predict.output\")[0]\n",
|
| 475 |
+
" if outType == \"prob\":\n",
|
| 476 |
+
" outputSize = config.getIntConfig(\"train.output.size\")[0]\n",
|
| 477 |
+
" if outputSize == 2:\n",
|
| 478 |
+
" #return prob of pos class for binary classifier \n",
|
| 479 |
+
" yPred = yPred[:, 1]\n",
|
| 480 |
+
" else:\n",
|
| 481 |
+
" #return class value and probability for multi classifier \n",
|
| 482 |
+
" yCl = np.argmax(yPred, axis=1)\n",
|
| 483 |
+
" yPred = list(map(lambda y : y[0][y[1]], zip(yPred, yCl)))\n",
|
| 484 |
+
" yPred = zip(yCl, yPred)\n",
|
| 485 |
+
" else:\n",
|
| 486 |
+
" yPred = np.argmax(yPred, axis=1)\n",
|
| 487 |
+
" return yPred\n",
|
| 488 |
+
"\n",
|
| 489 |
+
" @staticmethod\n",
|
| 490 |
+
" def printPrediction(yPred, config, dataSource):\n",
|
| 491 |
+
" \"\"\"\n",
|
| 492 |
+
" prints input feature data and prediction\n",
|
| 493 |
+
"\n",
|
| 494 |
+
" Parameters\n",
|
| 495 |
+
" yPred : predicted output\n",
|
| 496 |
+
" config : config object\n",
|
| 497 |
+
" dataSource : data source str if file path or 2D array\n",
|
| 498 |
+
" \"\"\"\n",
|
| 499 |
+
" #prDataFilePath = config.getStringConfig(\"predict.data.file\")[0]\n",
|
| 500 |
+
" padWidth = config.getIntConfig(\"predict.feat.pad.size\")[0]\n",
|
| 501 |
+
" i = 0\n",
|
| 502 |
+
" if type(dataSource) == str:\n",
|
| 503 |
+
" for rec in fileRecGen(dataSource, \",\"):\n",
|
| 504 |
+
" feat = (\",\".join(rec)).ljust(padWidth, \" \")\n",
|
| 505 |
+
" rec = feat + \"\\t\" + str(yPred[i])\n",
|
| 506 |
+
" print(rec)\n",
|
| 507 |
+
" i += 1\n",
|
| 508 |
+
" else:\n",
|
| 509 |
+
" for rec in dataSource:\n",
|
| 510 |
+
" srec = toStrList(rec, 6)\n",
|
| 511 |
+
" feat = (\",\".join(srec)).ljust(padWidth, \" \")\n",
|
| 512 |
+
" srec = feat + \"\\t\" + str(yPred[i])\n",
|
| 513 |
+
" print(srec)\n",
|
| 514 |
+
" i += 1\n",
|
| 515 |
+
"\n",
|
| 516 |
+
"\n",
|
| 517 |
+
" @staticmethod\n",
|
| 518 |
+
" def allTrain(model):\n",
|
| 519 |
+
" \"\"\"\n",
|
| 520 |
+
" train with all data\n",
|
| 521 |
+
"\n",
|
| 522 |
+
" Parameters\n",
|
| 523 |
+
" model : torch model\n",
|
| 524 |
+
" \"\"\"\n",
|
| 525 |
+
" # train mode\n",
|
| 526 |
+
" model.train()\n",
|
| 527 |
+
" for t in range(model.numIter):\n",
|
| 528 |
+
"\n",
|
| 529 |
+
"\n",
|
| 530 |
+
" # Forward pass: Compute predicted y by passing x to the model\n",
|
| 531 |
+
" yPred = model(model.featData)\n",
|
| 532 |
+
"\n",
|
| 533 |
+
" # Compute and print loss\n",
|
| 534 |
+
" loss = model.lossFn(yPred, model.outData)\n",
|
| 535 |
+
" if model.verbose and t % 50 == 0:\n",
|
| 536 |
+
" print(\"epoch {} loss {:.6f}\".format(t, loss.item()))\n",
|
| 537 |
+
"\n",
|
| 538 |
+
" # Zero gradients, perform a backward pass, and update the weights.\n",
|
| 539 |
+
" model.optimizer.zero_grad()\n",
|
| 540 |
+
" loss.backward()\n",
|
| 541 |
+
" model.optimizer.step() \t\n",
|
| 542 |
+
"\n",
|
| 543 |
+
" #validate\n",
|
| 544 |
+
" model.eval()\n",
|
| 545 |
+
" yPred = model(model.validFeatData)\n",
|
| 546 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 547 |
+
" yActual = model.validOutData\n",
|
| 548 |
+
" if model.verbose:\n",
|
| 549 |
+
" result = np.concatenate((yPred, yActual), axis = 1)\n",
|
| 550 |
+
" print(\"predicted actual\")\n",
|
| 551 |
+
" print(result)\n",
|
| 552 |
+
"\n",
|
| 553 |
+
" score = perfMetric(model.accMetric, yActual, yPred)\n",
|
| 554 |
+
" print(formatFloat(3, score, \"perf score\"))\n",
|
| 555 |
+
" return score\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" @staticmethod\n",
|
| 558 |
+
" def batchTrain(model):\n",
|
| 559 |
+
" \"\"\"\n",
|
| 560 |
+
" train with batch data\n",
|
| 561 |
+
"\n",
|
| 562 |
+
" Parameters\n",
|
| 563 |
+
" model : torch model\n",
|
| 564 |
+
" \"\"\"\n",
|
| 565 |
+
" model.restored = False\n",
|
| 566 |
+
" trainData = TensorDataset(model.featData, model.outData)\n",
|
| 567 |
+
" trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)\n",
|
| 568 |
+
" epochIntv = model.config.getIntConfig(\"train.epoch.intv\")[0]\n",
|
| 569 |
+
"\n",
|
| 570 |
+
" # train mode\n",
|
| 571 |
+
" model.train()\n",
|
| 572 |
+
"\n",
|
| 573 |
+
" if model.trackErr:\n",
|
| 574 |
+
" trErr = list()\n",
|
| 575 |
+
" vaErr = list()\n",
|
| 576 |
+
" #epoch\n",
|
| 577 |
+
" for t in range(model.numIter):\n",
|
| 578 |
+
" #batch\n",
|
| 579 |
+
" b = 0\n",
|
| 580 |
+
" epochLoss = 0.0\n",
|
| 581 |
+
" for xBatch, yBatch in trainDataLoader:\n",
|
| 582 |
+
"\n",
|
| 583 |
+
" # Forward pass: Compute predicted y by passing x to the model\n",
|
| 584 |
+
" xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)\n",
|
| 585 |
+
" yPred = model(xBatch)\n",
|
| 586 |
+
"\n",
|
| 587 |
+
" # Compute and print loss\n",
|
| 588 |
+
" loss = model.lossFn(yPred, yBatch)\n",
|
| 589 |
+
" if model.verbose and t % epochIntv == 0 and b % model.batchIntv == 0:\n",
|
| 590 |
+
" print(\"epoch {} batch {} loss {:.6f}\".format(t, b, loss.item()))\n",
|
| 591 |
+
"\n",
|
| 592 |
+
" if model.trackErr and model.batchIntv == 0:\n",
|
| 593 |
+
" epochLoss += loss.item()\n",
|
| 594 |
+
"\n",
|
| 595 |
+
" #error tracking at batch level\n",
|
| 596 |
+
" if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
|
| 597 |
+
" trErr.append(loss.item())\n",
|
| 598 |
+
" vloss = FeedForwardNetwork.evaluateModel(model)\n",
|
| 599 |
+
" vaErr.append(vloss)\n",
|
| 600 |
+
"\n",
|
| 601 |
+
" # Zero gradients, perform a backward pass, and update the weights.\n",
|
| 602 |
+
" model.optimizer.zero_grad()\n",
|
| 603 |
+
" loss.backward()\n",
|
| 604 |
+
" model.optimizer.step() \t\n",
|
| 605 |
+
" b += 1\n",
|
| 606 |
+
"\n",
|
| 607 |
+
" #error tracking at epoch level\n",
|
| 608 |
+
" if model.trackErr and model.batchIntv == 0:\n",
|
| 609 |
+
" epochLoss /= len(trainDataLoader)\n",
|
| 610 |
+
" trErr.append(epochLoss)\n",
|
| 611 |
+
" vloss = FeedForwardNetwork.evaluateModel(model)\n",
|
| 612 |
+
" vaErr.append(vloss)\n",
|
| 613 |
+
"\n",
|
| 614 |
+
" #validate\n",
|
| 615 |
+
" model.eval()\n",
|
| 616 |
+
" yPred = model(model.validFeatData)\n",
|
| 617 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 618 |
+
" yActual = model.validOutData\n",
|
| 619 |
+
" if model.verbose:\n",
|
| 620 |
+
" vsize = yPred.shape[0]\n",
|
| 621 |
+
" print(\"\\npredicted \\t\\t actual\")\n",
|
| 622 |
+
" for i in range(vsize):\n",
|
| 623 |
+
" print(str(yPred[i]) + \"\\t\" + str(yActual[i]))\n",
|
| 624 |
+
"\n",
|
| 625 |
+
" score = perfMetric(model.accMetric, yActual, yPred)\n",
|
| 626 |
+
" print(yActual)\n",
|
| 627 |
+
" print(yPred)\n",
|
| 628 |
+
" print(formatFloat(3, score, \"perf score\"))\n",
|
| 629 |
+
"\n",
|
| 630 |
+
" #save\n",
|
| 631 |
+
" modelSave = model.config.getBooleanConfig(\"train.model.save\")[0]\n",
|
| 632 |
+
" if modelSave:\n",
|
| 633 |
+
" FeedForwardNetwork.saveCheckpt(model)\n",
|
| 634 |
+
"\n",
|
| 635 |
+
" if model.trackErr:\n",
|
| 636 |
+
" FeedForwardNetwork.errorPlot(model, trErr, vaErr)\n",
|
| 637 |
+
"\n",
|
| 638 |
+
" if model.config.getBooleanConfig(\"train.print.weights\")[0]:\n",
|
| 639 |
+
" print(\"model weights\")\n",
|
| 640 |
+
" for param in model.parameters():\n",
|
| 641 |
+
" print(param.data)\n",
|
| 642 |
+
" return score\n",
|
| 643 |
+
"\n",
|
| 644 |
+
" @staticmethod\n",
|
| 645 |
+
" def errorPlot(model, trErr, vaErr):\n",
|
| 646 |
+
" \"\"\"\n",
|
| 647 |
+
" plot errors\n",
|
| 648 |
+
"\n",
|
| 649 |
+
" Parameters\n",
|
| 650 |
+
" trErr : training error list\t\n",
|
| 651 |
+
" vaErr : validation error list\t\n",
|
| 652 |
+
" \"\"\"\n",
|
| 653 |
+
" x = np.arange(len(trErr))\n",
|
| 654 |
+
" plt.plot(x,trErr,label = \"training error\")\n",
|
| 655 |
+
" plt.plot(x,vaErr,label = \"validation error\")\n",
|
| 656 |
+
" plt.xlabel(\"iteration\")\n",
|
| 657 |
+
" plt.ylabel(\"error\")\n",
|
| 658 |
+
" plt.legend([\"training error\", \"validation error\"], loc='upper left')\n",
|
| 659 |
+
" plt.show()\n",
|
| 660 |
+
"\n",
|
| 661 |
+
" @staticmethod\n",
|
| 662 |
+
" def modelPredict(model, dataSource = None):\n",
|
| 663 |
+
" \"\"\"\n",
|
| 664 |
+
" predict\n",
|
| 665 |
+
"\n",
|
| 666 |
+
" Parameters\n",
|
| 667 |
+
" model : torch model\n",
|
| 668 |
+
" dataSource : data source\n",
|
| 669 |
+
" \"\"\"\n",
|
| 670 |
+
" #train or restore model\n",
|
| 671 |
+
" useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
|
| 672 |
+
" if useSavedModel:\n",
|
| 673 |
+
" FeedForwardNetwork.restoreCheckpt(model)\n",
|
| 674 |
+
" else:\n",
|
| 675 |
+
" FeedForwardNetwork.batchTrain(model) \n",
|
| 676 |
+
"\n",
|
| 677 |
+
" #predict\n",
|
| 678 |
+
" if dataSource is None:\n",
|
| 679 |
+
" dataSource = model.config.getStringConfig(\"predict.data.file\")[0]\n",
|
| 680 |
+
" featData = FeedForwardNetwork.prepData(model, dataSource, False)\n",
|
| 681 |
+
" #print(featData)\n",
|
| 682 |
+
" featData = torch.from_numpy(featData)\n",
|
| 683 |
+
" featData = featData.to(model.device)\n",
|
| 684 |
+
"\n",
|
| 685 |
+
" model.eval()\n",
|
| 686 |
+
" yPred = model(featData)\n",
|
| 687 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 688 |
+
" #print(yPred)\n",
|
| 689 |
+
"\n",
|
| 690 |
+
" if model.outputSize >= 2:\n",
|
| 691 |
+
" #classification\n",
|
| 692 |
+
" yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)\n",
|
| 693 |
+
"\n",
|
| 694 |
+
" # print prediction\n",
|
| 695 |
+
" if model.config.getBooleanConfig(\"predict.print.output\")[0]:\n",
|
| 696 |
+
" FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)\n",
|
| 697 |
+
"\n",
|
| 698 |
+
" return yPred\n",
|
| 699 |
+
"\n",
|
| 700 |
+
" def predict(self, dataSource = None):\n",
|
| 701 |
+
" \"\"\"\n",
|
| 702 |
+
" predict\n",
|
| 703 |
+
"\n",
|
| 704 |
+
" Parameters\n",
|
| 705 |
+
" dataSource : data source\n",
|
| 706 |
+
" \"\"\"\n",
|
| 707 |
+
" return FeedForwardNetwork.modelPredict(self, dataSource)\n",
|
| 708 |
+
"\n",
|
| 709 |
+
" @staticmethod\n",
|
| 710 |
+
" def evaluateModel(model):\n",
|
| 711 |
+
" \"\"\"\n",
|
| 712 |
+
" evaluate model\n",
|
| 713 |
+
"\n",
|
| 714 |
+
" Parameters\n",
|
| 715 |
+
" model : torch model\n",
|
| 716 |
+
" \"\"\"\n",
|
| 717 |
+
" model.eval()\n",
|
| 718 |
+
" with torch.no_grad():\n",
|
| 719 |
+
" yPred = model(model.validFeatData)\n",
|
| 720 |
+
" #yPred = yPred.data.cpu().numpy()\n",
|
| 721 |
+
" yActual = model.validOutData\n",
|
| 722 |
+
" score = model.lossFn(yPred, yActual).item()\n",
|
| 723 |
+
" model.train()\n",
|
| 724 |
+
" return score\n",
|
| 725 |
+
"\n",
|
| 726 |
+
" @staticmethod\n",
|
| 727 |
+
" def prepValidate(model, dataSource=None):\n",
|
| 728 |
+
" \"\"\"\n",
|
| 729 |
+
" prepare for validation\n",
|
| 730 |
+
"\n",
|
| 731 |
+
" Parameters\n",
|
| 732 |
+
" model : torch model\n",
|
| 733 |
+
" dataSource : data source\n",
|
| 734 |
+
" \"\"\"\n",
|
| 735 |
+
" #train or restore model\n",
|
| 736 |
+
" if not model.restored:\n",
|
| 737 |
+
" useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
|
| 738 |
+
" if useSavedModel:\n",
|
| 739 |
+
" FeedForwardNetwork.restoreCheckpt(model)\n",
|
| 740 |
+
" else:\n",
|
| 741 |
+
" FeedForwardNetwork.batchTrain(model)\n",
|
| 742 |
+
" model.restored = True\n",
|
| 743 |
+
"\n",
|
| 744 |
+
" if \tdataSource is not None:\n",
|
| 745 |
+
" model.setValidationData(dataSource)\n",
|
| 746 |
+
"\n",
|
| 747 |
+
" @staticmethod\n",
|
| 748 |
+
" def validateModel(model, retPred=False):\n",
|
| 749 |
+
" \"\"\"\n",
|
| 750 |
+
" pmodel validation\n",
|
| 751 |
+
"\n",
|
| 752 |
+
" Parameters\n",
|
| 753 |
+
" model : torch model\n",
|
| 754 |
+
" retPred : if True return prediction\n",
|
| 755 |
+
" \"\"\"\n",
|
| 756 |
+
" model.eval()\n",
|
| 757 |
+
" yPred = model(model.validFeatData)\n",
|
| 758 |
+
" yPred = yPred.data.cpu().numpy()\n",
|
| 759 |
+
" model.yPred = yPred\n",
|
| 760 |
+
" yActual = model.validOutData\n",
|
| 761 |
+
" vsize = yPred.shape[0]\n",
|
| 762 |
+
" if model.verbose:\n",
|
| 763 |
+
" print(\"\\npredicted \\t actual\")\n",
|
| 764 |
+
" for i in range(vsize):\n",
|
| 765 |
+
" print(\"{:.3f}\\t\\t{:.3f}\".format(yPred[i][0], yActual[i][0]))\n",
|
| 766 |
+
"\n",
|
| 767 |
+
" score = perfMetric(model.accMetric, yActual, yPred)\n",
|
| 768 |
+
" print(formatFloat(3, score, \"perf score\"))\n",
|
| 769 |
+
"\n",
|
| 770 |
+
" if retPred:\n",
|
| 771 |
+
" y = list(map(lambda i : (yPred[i][0], yActual[i][0]), range(vsize)))\n",
|
| 772 |
+
" res = (y, score)\n",
|
| 773 |
+
" return res\n",
|
| 774 |
+
" else:\t\n",
|
| 775 |
+
" return score"
|
| 776 |
+
]
|
| 777 |
+
}
|
| 778 |
+
],
|
| 779 |
+
"metadata": {
|
| 780 |
+
"kernelspec": {
|
| 781 |
+
"display_name": "Python 3 (ipykernel)",
|
| 782 |
+
"language": "python",
|
| 783 |
+
"name": "python3"
|
| 784 |
+
},
|
| 785 |
+
"language_info": {
|
| 786 |
+
"codemirror_mode": {
|
| 787 |
+
"name": "ipython",
|
| 788 |
+
"version": 3
|
| 789 |
+
},
|
| 790 |
+
"file_extension": ".py",
|
| 791 |
+
"mimetype": "text/x-python",
|
| 792 |
+
"name": "python",
|
| 793 |
+
"nbconvert_exporter": "python",
|
| 794 |
+
"pygments_lexer": "ipython3",
|
| 795 |
+
"version": "3.9.12"
|
| 796 |
+
}
|
| 797 |
+
},
|
| 798 |
+
"nbformat": 4,
|
| 799 |
+
"nbformat_minor": 5
|
| 800 |
+
}
|
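FeedForwardNetwork above is driven entirely by a properties-style configuration: buildModel reads the train.*, valid.* and predict.* keys registered in the constructor, and train.layer.data describes each layer as units:activation:batch-norm:batch-norm-after-activation:dropout, comma-separated across layers. A minimal driver sketch follows, assuming the notebook is importable as the tnn module and that the config file and the CSV files it points to exist; the file names, field indices and layer spec are illustrative, not taken from the repo:

```python
# hypothetical ffn_demo.properties (keys are the ones buildModel reads; values are illustrative):
#   train.data.file=train.csv
#   valid.data.file=valid.csv
#   predict.data.file=predict.csv
#   train.data.fields=0,1,2,3
#   train.data.feature.fields=0,1,2
#   train.data.out.fields=3
#   train.output.size=1
#   train.layer.data=8:relu:false:false:0,1:none:false:false:0
#   valid.accuracy.metric=<a metric name supported by perfMetric in util>
from tnn import FeedForwardNetwork

model = FeedForwardNetwork("ffn_demo.properties")  # hypothetical config path
model.buildModel()                                 # builds layers, loads training and validation data
FeedForwardNetwork.batchTrain(model)               # mini-batch training, prints validation score
yPred = model.predict()                            # scores predict.data.file by default
```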
lib/txproc.ipynb
ADDED
|
@@ -0,0 +1,1002 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "f720c141",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"from random import randint\n",
|
| 13 |
+
"import random\n",
|
| 14 |
+
"import time\n",
|
| 15 |
+
"from datetime import datetime\n",
|
| 16 |
+
"import re, string, unicodedata\n",
|
| 17 |
+
"import nltk\n",
|
| 18 |
+
"import contractions\n",
|
| 19 |
+
"import inflect\n",
|
| 20 |
+
"from bs4 import BeautifulSoup\n",
|
| 21 |
+
"from nltk import word_tokenize, sent_tokenize\n",
|
| 22 |
+
"from nltk.corpus import stopwords\n",
|
| 23 |
+
"from nltk.stem.isri import ISRIStemmer\n",
|
| 24 |
+
"from nltk.stem.porter import PorterStemmer\n",
|
| 25 |
+
"from nltk.stem.snowball import SnowballStemmer\n",
|
| 26 |
+
"from nltk.stem import LancasterStemmer, WordNetLemmatizer\n",
|
| 27 |
+
"from nltk.tag import StanfordNERTagger\n",
|
| 28 |
+
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
|
| 29 |
+
"import spacy\n",
|
| 30 |
+
"import torch\n",
|
| 31 |
+
"from collections import defaultdict\n",
|
| 32 |
+
"import pickle\n",
|
| 33 |
+
"import numpy as np\n",
|
| 34 |
+
"import re\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"sys.path.append(os.path.abspath(\"../lib\"))\n",
|
| 37 |
+
"from util import *\n",
|
| 38 |
+
"from mlutil import *\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"lcc = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
|
| 41 |
+
"\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
|
| 42 |
+
"ucc = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\", \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\"]\n",
|
| 43 |
+
"dig = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
|
| 44 |
+
"spc = [\"@\",\"#\",\"$\",\"%\",\"^\",\"&\",\"*\",\"(\",\")\",\"_\",\"+\",\"{\",\"}\",\"[\",\"]\",\"|\",\":\",\"<\",\">\",\"?\",\";\",\",\",\".\"]\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"class TextPreProcessor:\n",
|
| 48 |
+
" \"\"\"\n",
|
| 49 |
+
" text preprocessor\n",
|
| 50 |
+
" \"\"\"\n",
|
| 51 |
+
" def __init__(self, stemmer = \"lancaster\", verbose=False):\n",
|
| 52 |
+
" self.verbose = verbose\n",
|
| 53 |
+
" self.lemmatizer = WordNetLemmatizer()\n",
|
| 54 |
+
"\n",
|
| 55 |
+
" def stripHtml(self, text):\n",
|
| 56 |
+
" soup = BeautifulSoup(text, \"html.parser\")\n",
|
| 57 |
+
" return soup.get_text()\n",
|
| 58 |
+
"\n",
|
| 59 |
+
" def removeBetweenSquareBrackets(self, text):\n",
|
| 60 |
+
" return re.sub('\\[[^]]*\\]', '', text)\n",
|
| 61 |
+
"\n",
|
| 62 |
+
" def denoiseText(self, text):\n",
|
| 63 |
+
" text = stripHtml(text)\n",
|
| 64 |
+
" text = removeBetweenSquareBrackets(text)\n",
|
| 65 |
+
" return text\n",
|
| 66 |
+
"\n",
|
| 67 |
+
" def replaceContractions(self, text):\n",
|
| 68 |
+
" \"\"\"Replace contractions in string of text\"\"\"\n",
|
| 69 |
+
" return contractions.fix(text)\n",
|
| 70 |
+
"\n",
|
| 71 |
+
" def tokenize(self, text):\n",
|
| 72 |
+
" words = nltk.word_tokenize(text)\n",
|
| 73 |
+
" return words\n",
|
| 74 |
+
"\n",
|
| 75 |
+
" def removeNonAscii(self, words):\n",
|
| 76 |
+
" \"\"\"Remove non-ASCII characters from list of tokenized words\"\"\"\n",
|
| 77 |
+
" newWords = []\n",
|
| 78 |
+
" for word in words:\n",
|
| 79 |
+
" if isinstance(word, unicode):\n",
|
| 80 |
+
" newWord = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')\n",
|
| 81 |
+
" else:\n",
|
| 82 |
+
" newWord = word\n",
|
| 83 |
+
" newWords.append(newWord)\n",
|
| 84 |
+
" return newWords\n",
|
| 85 |
+
"\n",
|
| 86 |
+
" def replaceNonAsciiFromText(self, text):\n",
|
| 87 |
+
" \"\"\" replaces non ascii with blank \"\"\"\n",
|
| 88 |
+
" return ''.join([i if ord(i) < 128 else ' ' for i in text])\n",
|
| 89 |
+
"\n",
|
| 90 |
+
" def removeNonAsciiFromText(self, text):\n",
|
| 91 |
+
" \"\"\" replaces non ascii with blank \"\"\"\n",
|
| 92 |
+
" return ''.join([i if ord(i) < 128 else '' for i in text])\n",
|
| 93 |
+
"\n",
|
| 94 |
+
" def allow(self, words):\n",
|
| 95 |
+
" \"\"\" allow only specific charaters \"\"\"\n",
|
| 96 |
+
" allowed = [word for word in words if re.match('^[A-Za-z0-9\\.\\,\\:\\;\\!\\?\\(\\)\\'\\-\\$\\@\\%\\\"]+$', word) is not None]\t\t\n",
|
| 97 |
+
" return allowed\t\t\n",
|
| 98 |
+
"\n",
|
| 99 |
+
" def toLowercase(self, words):\n",
|
| 100 |
+
" \"\"\"Convert all characters to lowercase from list of tokenized words\"\"\"\n",
|
| 101 |
+
" newWords = [word.lower() for word in words]\n",
|
| 102 |
+
" return newWords\n",
|
| 103 |
+
"\n",
|
| 104 |
+
" def removePunctuation(self, words):\n",
|
| 105 |
+
" \"\"\"Remove punctuation from list of tokenized words\"\"\"\n",
|
| 106 |
+
" newWords = []\n",
|
| 107 |
+
" for word in words:\n",
|
| 108 |
+
" newWord = re.sub(r'[^\\w\\s]', '', word)\n",
|
| 109 |
+
" if newWord != '':\n",
|
| 110 |
+
" newWords.append(newWord)\n",
|
| 111 |
+
" return newWords\n",
|
| 112 |
+
"\n",
|
| 113 |
+
" def replaceNumbers(self, words):\n",
|
| 114 |
+
" \"\"\"Replace all interger occurrences in list of tokenized words with textual representation\"\"\"\n",
|
| 115 |
+
" p = inflect.engine()\n",
|
| 116 |
+
" newWords = []\n",
|
| 117 |
+
" for word in words:\n",
|
| 118 |
+
" if word.isdigit():\n",
|
| 119 |
+
" newWord = p.number_to_words(word)\n",
|
| 120 |
+
" newWords.append(newWord)\n",
|
| 121 |
+
" else:\n",
|
| 122 |
+
" newWords.append(word)\n",
|
| 123 |
+
" return newWords\n",
|
| 124 |
+
"\n",
|
| 125 |
+
" def removeStopwords(self, words):\n",
|
| 126 |
+
" \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
|
| 127 |
+
" newWords = []\n",
|
| 128 |
+
" for word in words:\n",
|
| 129 |
+
" if word not in stopwords.words('english'):\n",
|
| 130 |
+
" newWords.append(word)\n",
|
| 131 |
+
" return newWords\n",
|
| 132 |
+
"\n",
|
| 133 |
+
" def removeCustomStopwords(self, words, stopWords):\n",
|
| 134 |
+
" \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
|
| 135 |
+
" removed = [word for word in words if word not in stopWords]\t\t\n",
|
| 136 |
+
" return removed\n",
|
| 137 |
+
"\n",
|
| 138 |
+
" def removeLowFreqWords(self, words, minFreq):\n",
|
| 139 |
+
" \"\"\"Remove low frewquncy words from list of tokenized words\"\"\"\n",
|
| 140 |
+
" frequency = defaultdict(int)\n",
|
| 141 |
+
" for word in words:\n",
|
| 142 |
+
" frequency[word] += 1\n",
|
| 143 |
+
" removed = [word for word in words if frequency[word] > minFreq]\t\t\n",
|
| 144 |
+
" return removed\t\n",
|
| 145 |
+
"\n",
|
| 146 |
+
" def removeNumbers(self, words):\n",
|
| 147 |
+
" \"\"\"Remove numbers\"\"\"\n",
|
| 148 |
+
" removed = [word for word in words if not isNumber(word)]\t\t\n",
|
| 149 |
+
" return removed\t\t\n",
|
| 150 |
+
"\n",
|
| 151 |
+
" def removeShortWords(self, words, minLengh):\n",
|
| 152 |
+
" \"\"\"Remove short words \"\"\"\n",
|
| 153 |
+
" removed = [word for word in words if len(word) >= minLengh]\t\t\n",
|
| 154 |
+
" return removed\t\t\n",
|
| 155 |
+
"\n",
|
| 156 |
+
" def keepAllowedWords(self, words, keepWords):\n",
|
| 157 |
+
" \"\"\"Keep words from the list only\"\"\"\n",
|
| 158 |
+
" kept = [word for word in words if word in keepWords]\t\t\n",
|
| 159 |
+
" return kept\n",
|
| 160 |
+
"\n",
|
| 161 |
+
" def stemWords(self, words):\n",
|
| 162 |
+
" \"\"\"Stem words in list of tokenized words\"\"\"\n",
|
| 163 |
+
" if stemmer == \"lancaster\":\n",
|
| 164 |
+
" stemmer = LancasterStemmer()\n",
|
| 165 |
+
" elif stemmer == \"snowbal\":\n",
|
| 166 |
+
" stemmer = SnowballStemmer()\n",
|
| 167 |
+
" elif stemmer == \"porter\":\n",
|
| 168 |
+
" stemmer = PorterStemmer()\n",
|
| 169 |
+
" stems = [stemmer.stem(word) for word in words]\n",
|
| 170 |
+
" return stems\n",
|
| 171 |
+
"\n",
|
| 172 |
+
" def lemmatizeWords(self, words):\n",
|
| 173 |
+
" \"\"\"Lemmatize tokens in list of tokenized words\"\"\"\n",
|
| 174 |
+
" lemmas = [self.lemmatizer.lemmatize(word) for word in words]\n",
|
| 175 |
+
" return lemmas\n",
|
| 176 |
+
"\n",
|
| 177 |
+
" def lemmatizeVerbs(self, words):\n",
|
| 178 |
+
" \"\"\"Lemmatize verbs in list of tokenized words\"\"\"\n",
|
| 179 |
+
" lemmas = [self.lemmatizer.lemmatize(word, pos='v') for word in words]\n",
|
| 180 |
+
" return lemmas\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" def normalize(self, words):\n",
|
| 183 |
+
" words = self.removeNonAscii(words)\n",
|
| 184 |
+
" words = self.toLowercase(words)\n",
|
| 185 |
+
" words = self.removePunctuation(words)\n",
|
| 186 |
+
" words = self.replaceNumbers(words)\n",
|
| 187 |
+
" words = self.removeStopwords(words)\n",
|
| 188 |
+
" return words\n",
|
| 189 |
+
"\n",
|
| 190 |
+
" def posTag(self, textTokens):\n",
|
| 191 |
+
" tags = nltk.pos_tag(textTokens)\n",
|
| 192 |
+
" return tags\n",
|
| 193 |
+
"\n",
|
| 194 |
+
" def extractEntity(self, textTokens, classifierPath, jarPath):\n",
|
| 195 |
+
" st = StanfordNERTagger(classifierPath, jarPath) \n",
|
| 196 |
+
" entities = st.tag(textTokens)\n",
|
| 197 |
+
" return entities\n",
|
| 198 |
+
"\n",
|
| 199 |
+
" def documentFeatures(self, document, wordFeatures):\n",
|
| 200 |
+
" documentWords = set(document)\n",
|
| 201 |
+
" features = {}\n",
|
| 202 |
+
" for word in wordFeatures:\n",
|
| 203 |
+
" features[word] = (word in documentWords)\n",
|
| 204 |
+
" return features\n",
|
| 205 |
+
"\n",
|
| 206 |
+
"class NGram:\n",
|
| 207 |
+
" \"\"\"\n",
|
| 208 |
+
" word ngram\n",
|
| 209 |
+
" \"\"\"\n",
|
| 210 |
+
" def __init__(self, vocFilt, verbose=False):\n",
|
| 211 |
+
" \"\"\"\n",
|
| 212 |
+
" initialize\n",
|
| 213 |
+
" \"\"\"\n",
|
| 214 |
+
" self.vocFilt = vocFilt\n",
|
| 215 |
+
" self.nGramCounter = dict()\n",
|
| 216 |
+
" self.nGramFreq = dict()\n",
|
| 217 |
+
" self.corpSize = 0\n",
|
| 218 |
+
" self.vocabulary = set()\n",
|
| 219 |
+
" self.freqDone = False\n",
|
| 220 |
+
" self.verbose = verbose\n",
|
| 221 |
+
" self.vecWords = None\n",
|
| 222 |
+
" self.nonZeroCount = 0\n",
|
| 223 |
+
"\n",
|
| 224 |
+
" def countDocNGrams(self, words):\n",
|
| 225 |
+
" \"\"\"\n",
|
| 226 |
+
" count words in a doc\n",
|
| 227 |
+
" \"\"\"\n",
|
| 228 |
+
" if self.verbose:\n",
|
| 229 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 230 |
+
" nGrams = self.toNGram(words)\n",
|
| 231 |
+
" for nGram in nGrams:\n",
|
| 232 |
+
" count = self.nGramCounter.get(nGram, 0)\n",
|
| 233 |
+
" self.nGramCounter[nGram] = count + 1\n",
|
| 234 |
+
" self.corpSize += 1\n",
|
| 235 |
+
" self.vocabulary.update(words)\t\n",
|
| 236 |
+
"\n",
|
| 237 |
+
" def remLowCount(self, minCount):\n",
|
| 238 |
+
" \"\"\"\n",
|
| 239 |
+
" removes items with count below threshold\n",
|
| 240 |
+
" \"\"\"\n",
|
| 241 |
+
" self.nGramCounter = dict(filter(lambda item: item[1] >= minCount, self.nGramCounter.items()))\n",
|
| 242 |
+
"\n",
|
| 243 |
+
" def getVocabSize(self):\n",
|
| 244 |
+
" \"\"\"\n",
|
| 245 |
+
" get vocabulary size\n",
|
| 246 |
+
" \"\"\"\n",
|
| 247 |
+
" return len(self.nGramCounter)\n",
|
| 248 |
+
"\n",
|
| 249 |
+
" def getNGramFreq(self):\n",
|
| 250 |
+
" \"\"\"\n",
|
| 251 |
+
" get normalized count\n",
|
| 252 |
+
" \"\"\"\n",
|
| 253 |
+
" if self.verbose:\n",
|
| 254 |
+
" print (\"counter size \" + str(len(self.nGramCounter)))\n",
|
| 255 |
+
" if not self.freqDone:\n",
|
| 256 |
+
" for item in self.nGramCounter.items():\n",
|
| 257 |
+
" self.nGramFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
|
| 258 |
+
" self.freqDone = True\n",
|
| 259 |
+
" return self.nGramFreq\n",
|
| 260 |
+
"\n",
|
| 261 |
+
" def getNGramIndex(self, show):\n",
|
| 262 |
+
" \"\"\"\n",
|
| 263 |
+
" convert to list\n",
|
| 264 |
+
" \"\"\"\n",
|
| 265 |
+
" if self.vecWords is None:\n",
|
| 266 |
+
" self.vecWords = list(self.nGramCounter)\n",
|
| 267 |
+
" if show:\n",
|
| 268 |
+
" for vw in enumerate(self.vecWords):\n",
|
| 269 |
+
" print(vw)\n",
|
| 270 |
+
"\n",
|
| 271 |
+
" def getVector(self, words, byCount, normalized):\n",
|
| 272 |
+
" \"\"\"\n",
|
| 273 |
+
" convert to vector\n",
|
| 274 |
+
" \"\"\"\n",
|
| 275 |
+
" if self.vecWords is None:\n",
|
| 276 |
+
" self.vecWords = list(self.nGramCounter)\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" nGrams = self.toNGram(words)\n",
|
| 279 |
+
" if self.verbose:\n",
|
| 280 |
+
" print(\"vocabulary size {}\".format(len(self.vecWords)))\n",
|
| 281 |
+
" print(\"ngrams\")\n",
|
| 282 |
+
" print(nGrams)\n",
|
| 283 |
+
" self.nonZeroCount = 0\n",
|
| 284 |
+
" vec = list(map(lambda vw: self.getVecElem(vw, nGrams, byCount, normalized), self.vecWords))\n",
|
| 285 |
+
" return vec\n",
|
| 286 |
+
"\n",
|
| 287 |
+
" def getVecElem(self, vw, nGrams, byCount, normalized):\n",
|
| 288 |
+
" \"\"\"\n",
|
| 289 |
+
" get vector element\n",
|
| 290 |
+
" \"\"\"\n",
|
| 291 |
+
" if vw in nGrams:\n",
|
| 292 |
+
" if byCount:\n",
|
| 293 |
+
" if normalized:\n",
|
| 294 |
+
" el = self.nGramFreq[vw]\n",
|
| 295 |
+
" else:\n",
|
| 296 |
+
" el = self.nGramCounter[vw]\n",
|
| 297 |
+
" else:\n",
|
| 298 |
+
" el = 1\n",
|
| 299 |
+
" self.nonZeroCount += 1\n",
|
| 300 |
+
" else:\n",
|
| 301 |
+
" if (byCount and normalized):\n",
|
| 302 |
+
" el = 0.0\n",
|
| 303 |
+
" else:\n",
|
| 304 |
+
" el = 0\n",
|
| 305 |
+
" return el\n",
|
| 306 |
+
"\n",
|
| 307 |
+
" def getNonZeroCount(self):\n",
|
| 308 |
+
" \"\"\"\n",
|
| 309 |
+
" get non zero vector element count\n",
|
| 310 |
+
" \"\"\"\n",
|
| 311 |
+
" return self.nonZeroCount\n",
|
| 312 |
+
"\n",
|
| 313 |
+
" def toBiGram(self, words):\n",
|
| 314 |
+
" \"\"\"\n",
|
| 315 |
+
" convert to bigram\n",
|
| 316 |
+
" \"\"\"\n",
|
| 317 |
+
" if self.verbose:\n",
|
| 318 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 319 |
+
" biGrams = list()\n",
|
| 320 |
+
" for i in range(len(words)-1):\n",
|
| 321 |
+
" w1 = words[i]\n",
|
| 322 |
+
" w2 = words[i+1]\n",
|
| 323 |
+
" if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt):\n",
|
| 324 |
+
" nGram = (w1, w2)\n",
|
| 325 |
+
" biGrams.append(nGram)\n",
|
| 326 |
+
" return biGrams\n",
|
| 327 |
+
"\n",
|
| 328 |
+
" def toTriGram(self, words):\n",
|
| 329 |
+
" \"\"\"\n",
|
| 330 |
+
" convert to trigram\n",
|
| 331 |
+
" \"\"\"\n",
|
| 332 |
+
" if self.verbose:\n",
|
| 333 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 334 |
+
" triGrams = list()\n",
|
| 335 |
+
" for i in range(len(words)-2):\n",
|
| 336 |
+
" w1 = words[i]\n",
|
| 337 |
+
" w2 = words[i+1]\n",
|
| 338 |
+
" w3 = words[i+2]\n",
|
| 339 |
+
" if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt and w3 in self.vocFilt):\n",
|
| 340 |
+
" nGram = (w1, w2, w3)\n",
|
| 341 |
+
" triGrams.append(nGram)\n",
|
| 342 |
+
" return triGrams\n",
|
| 343 |
+
"\n",
|
| 344 |
+
" def save(self, saveFile):\n",
|
| 345 |
+
" \"\"\"\n",
|
| 346 |
+
" save \n",
|
| 347 |
+
" \"\"\"\n",
|
| 348 |
+
" sf = open(saveFile, \"wb\")\n",
|
| 349 |
+
" pickle.dump(self, sf)\n",
|
| 350 |
+
" sf.close()\n",
|
| 351 |
+
"\n",
|
| 352 |
+
" @staticmethod\n",
|
| 353 |
+
" def load(saveFile):\n",
|
| 354 |
+
" \"\"\"\n",
|
| 355 |
+
" load\n",
|
| 356 |
+
" \"\"\"\n",
|
| 357 |
+
" sf = open(saveFile, \"rb\")\n",
|
| 358 |
+
" nGrams = pickle.load(sf)\n",
|
| 359 |
+
" sf.close()\n",
|
| 360 |
+
" return nGrams\n",
|
| 361 |
+
"\n",
|
| 362 |
+
"class CharNGram:\n",
|
| 363 |
+
" \"\"\"\n",
|
| 364 |
+
" character n gram\n",
|
| 365 |
+
" \"\"\"\n",
|
| 366 |
+
" def __init__(self, domains, ngsize, verbose=False):\n",
|
| 367 |
+
" \"\"\"\n",
|
| 368 |
+
" initialize\n",
|
| 369 |
+
" \"\"\"\n",
|
| 370 |
+
" self.chDomain = list()\n",
|
| 371 |
+
" self.ws = \"#\"\n",
|
| 372 |
+
" self.chDomain.append(self.ws)\n",
|
| 373 |
+
" for d in domains:\n",
|
| 374 |
+
" if d == \"lcc\":\n",
|
| 375 |
+
" self.chDomain.extend(lcc)\n",
|
| 376 |
+
" elif d == \"ucc\":\n",
|
| 377 |
+
" self.chDomain.extend(ucc)\n",
|
| 378 |
+
" elif d == \"dig\":\n",
|
| 379 |
+
" self.chDomain.extend(dig)\n",
|
| 380 |
+
" elif d == \"spc\":\n",
|
| 381 |
+
" self.chDomain.extend(spc)\n",
|
| 382 |
+
" else:\n",
|
| 383 |
+
" raise ValueError(\"invalid character type \" + d)\n",
|
| 384 |
+
"\n",
|
| 385 |
+
" self.ngsize = ngsize\n",
|
| 386 |
+
" self.radixPow = None\n",
|
| 387 |
+
" self.cntVecSize = None\n",
|
| 388 |
+
"\n",
|
| 389 |
+
" def addSpChar(self, spChar):\n",
|
| 390 |
+
" \"\"\"\n",
|
| 391 |
+
" add special characters\n",
|
| 392 |
+
" \"\"\"\n",
|
| 393 |
+
" self.chDomain.extend(spChar)\n",
|
| 394 |
+
"\n",
|
| 395 |
+
" def setWsRepl(self, ws):\n",
|
| 396 |
+
" \"\"\"\n",
|
| 397 |
+
" set white space replacement charater\n",
|
| 398 |
+
" \"\"\"\n",
|
| 399 |
+
" self.ws = ws\n",
|
| 400 |
+
" self.chDomain[0] = self.ws\n",
|
| 401 |
+
"\n",
|
| 402 |
+
" def finalize(self):\n",
|
| 403 |
+
" \"\"\"\n",
|
| 404 |
+
" final setup\n",
|
| 405 |
+
" \"\"\"\t\t\n",
|
| 406 |
+
" domSize = len(self.chDomain)\n",
|
| 407 |
+
" self.cntVecSize = int(math.pow(domSize, self.ngsize))\n",
|
| 408 |
+
" if self.radixPow is None:\n",
|
| 409 |
+
" self.radixPow = list()\n",
|
| 410 |
+
" for i in range(self.ngsize-1, 0, -1):\n",
|
| 411 |
+
" self.radixPow.append(int(math.pow(domSize, i)))\n",
|
| 412 |
+
" self.radixPow.append(1)\n",
|
| 413 |
+
"\n",
|
| 414 |
+
"\n",
|
| 415 |
+
" def toMgramCount(self, text):\n",
|
| 416 |
+
" \"\"\"\n",
|
| 417 |
+
" get ngram count list\n",
|
| 418 |
+
" \"\"\"\n",
|
| 419 |
+
" #print(text)\n",
|
| 420 |
+
" ngCounts = [0] * self.cntVecSize\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" ngram = list()\n",
|
| 423 |
+
" totNgCount = 0\n",
|
| 424 |
+
" for ch in text:\n",
|
| 425 |
+
" if ch.isspace():\n",
|
| 426 |
+
" l = len(ngram)\n",
|
| 427 |
+
" if l == 0 or ngram[l-1] != self.ws:\n",
|
| 428 |
+
" ngram.append(self.ws)\n",
|
| 429 |
+
" else:\n",
|
| 430 |
+
" ngram.append(ch)\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" if len(ngram) == self.ngsize:\n",
|
| 433 |
+
" i = self.__getNgramIndex(ngram)\n",
|
| 434 |
+
" assert i < self.cntVecSize, \"ngram index out of range index \" + str(i) + \" size \" + str(self.cntVecSize) \n",
|
| 435 |
+
" ngCounts[i] += 1\n",
|
| 436 |
+
" ngram.clear()\n",
|
| 437 |
+
" totNgCount += 1\n",
|
| 438 |
+
"\n",
|
| 439 |
+
" return ngCounts\n",
|
| 440 |
+
"\n",
|
| 441 |
+
" def __getNgramIndex(self, ngram):\n",
|
| 442 |
+
" \"\"\"\n",
|
| 443 |
+
" get index of an ngram into a list of size equal total number of possible ngrams\n",
|
| 444 |
+
" \"\"\"\n",
|
| 445 |
+
" assert len(ngram) == len(self.radixPow), \"ngram size mismatch\"\t\t\n",
|
| 446 |
+
" ngi = 0\n",
|
| 447 |
+
" for ch, rp in zip(ngram, self.radixPow):\n",
|
| 448 |
+
" i = self.chDomain.index(ch)\n",
|
| 449 |
+
" ngi += i * rp\n",
|
| 450 |
+
"\n",
|
| 451 |
+
" return ngi\n",
|
| 452 |
+
"\n",
|
| 453 |
+
"\n",
|
| 454 |
+
"class TfIdf:\n",
|
| 455 |
+
" \"\"\"\n",
|
| 456 |
+
" TF IDF\t\n",
|
| 457 |
+
" \"\"\"\n",
|
| 458 |
+
" def __init__(self, vocFilt, doIdf, verbose=False):\n",
|
| 459 |
+
" \"\"\"\n",
|
| 460 |
+
" initialize\n",
|
| 461 |
+
" \"\"\"\n",
|
| 462 |
+
" self.vocFilt = vocFilt\n",
|
| 463 |
+
" self.doIdf = doIdf\n",
|
| 464 |
+
" self.wordCounter = {}\n",
|
| 465 |
+
" self.wordFreq = {}\n",
|
| 466 |
+
" self.wordInDocCount = {}\n",
|
| 467 |
+
" self.docCount = 0\n",
|
| 468 |
+
" self.corpSize = 0\n",
|
| 469 |
+
" self.freqDone = False\n",
|
| 470 |
+
" self.vocabulary = set()\n",
|
| 471 |
+
" self.wordIndex = None\n",
|
| 472 |
+
" self.verbose = verbose\n",
|
| 473 |
+
" self.vecWords = None\n",
|
| 474 |
+
"\n",
|
| 475 |
+
" def countDocWords(self, words):\n",
|
| 476 |
+
" \"\"\"\n",
|
| 477 |
+
" count words in a doc\n",
|
| 478 |
+
" \"\"\"\n",
|
| 479 |
+
" if self.verbose:\n",
|
| 480 |
+
" print (\"doc size \" + str(len(words)))\n",
|
| 481 |
+
" for word in words:\n",
|
| 482 |
+
" if self.vocFilt is None or word in self.vocFilt:\n",
|
| 483 |
+
" count = self.wordCounter.get(word, 0)\n",
|
| 484 |
+
" self.wordCounter[word] = count + 1\n",
|
| 485 |
+
" self.corpSize += len(words)\n",
|
| 486 |
+
" self.vocabulary.update(words)\n",
|
| 487 |
+
"\n",
|
| 488 |
+
" if (self.doIdf):\n",
|
| 489 |
+
" self.docCount += 1\n",
|
| 490 |
+
" for word in set(words):\n",
|
| 491 |
+
" self.wordInDocCount.get(word, 0)\n",
|
| 492 |
+
" self.wordInDocCount[word] = count + 1\n",
|
| 493 |
+
" self.freqDone = False\n",
|
| 494 |
+
"\n",
|
| 495 |
+
"\n",
|
| 496 |
+
" def getWordFreq(self):\n",
|
| 497 |
+
" \"\"\"\n",
|
| 498 |
+
" get tfidf for corpus\n",
|
| 499 |
+
" \"\"\"\n",
|
| 500 |
+
" if self.verbose:\n",
|
| 501 |
+
" print (\"counter size \" + str(len(self.wordCounter)))\n",
|
| 502 |
+
" if not self.freqDone:\n",
|
| 503 |
+
" for item in self.wordCounter.items():\n",
|
| 504 |
+
" self.wordFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
|
| 505 |
+
" if self.doIdf:\n",
|
| 506 |
+
" for k in self.wordFreq.keys():\n",
|
| 507 |
+
" self.wordFreq.items[k] *= math.log(self.docCount / self.wordInDocCount.items[k])\t\n",
|
| 508 |
+
" self.freqDone = True\n",
|
| 509 |
+
" return self.wordFreq\n",
|
| 510 |
+
"\n",
|
| 511 |
+
" def getCount(self, word):\n",
|
| 512 |
+
" \"\"\"\n",
|
| 513 |
+
" get counter\n",
|
| 514 |
+
" \"\"\"\n",
|
| 515 |
+
" if word in self.wordCounter:\n",
|
| 516 |
+
" count = self.wordCounter[word]\n",
|
| 517 |
+
" else:\n",
|
| 518 |
+
" raise ValueError(\"word not found in count table \" + word)\n",
|
| 519 |
+
" return count\n",
|
| 520 |
+
"\n",
|
| 521 |
+
" def getFreq(self, word):\n",
|
| 522 |
+
" \"\"\"\n",
|
| 523 |
+
" get normalized frequency\n",
|
| 524 |
+
" \"\"\"\n",
|
| 525 |
+
" if word in self.wordFreq:\n",
|
| 526 |
+
" freq = self.wordFreq[word]\n",
|
| 527 |
+
" else:\n",
|
| 528 |
+
" raise ValueError(\"word not found in count table \" + word)\n",
|
| 529 |
+
" return freq\n",
|
| 530 |
+
"\n",
|
| 531 |
+
" def resetCounter(self):\n",
|
| 532 |
+
" \"\"\"\n",
|
| 533 |
+
" reset counter\n",
|
| 534 |
+
" \"\"\"\n",
|
| 535 |
+
" self.wordCounter = {}\n",
|
| 536 |
+
"\n",
|
| 537 |
+
" def buildVocabulary(self, words):\n",
|
| 538 |
+
" \"\"\"\n",
|
| 539 |
+
" build vocbulary\n",
|
| 540 |
+
" \"\"\"\n",
|
| 541 |
+
" self.vocabulary.update(words)\n",
|
| 542 |
+
"\n",
|
| 543 |
+
" def getVocabulary(self):\n",
|
| 544 |
+
" \"\"\"\n",
|
| 545 |
+
" return vocabulary\n",
|
| 546 |
+
" \"\"\"\n",
|
| 547 |
+
" return self.vocabulary\n",
|
| 548 |
+
"\n",
|
| 549 |
+
" def creatWordIndex(self):\n",
|
| 550 |
+
" \"\"\"\n",
|
| 551 |
+
" index for all words in vcabulary\n",
|
| 552 |
+
" \"\"\"\n",
|
| 553 |
+
" self.wordIndex = {word : idx for idx, word in enumerate(list(self.vocabulary))}\n",
|
| 554 |
+
"\n",
|
| 555 |
+
" def getVector(self, words, byCount, normalized):\n",
|
| 556 |
+
" \"\"\"\n",
|
| 557 |
+
" get vector\n",
|
| 558 |
+
" \"\"\"\n",
|
| 559 |
+
" if self.vecWords is None:\n",
|
| 560 |
+
" self.vecWords = list(self.wordCounter)\n",
|
| 561 |
+
" vec = list(map(lambda vw: self.getVecElem(vw, words, byCount, normalized), self.vecWords))\n",
|
| 562 |
+
" return vec\n",
|
| 563 |
+
"\n",
|
| 564 |
+
" def getVecElem(self, vw, words, byCount, normalized):\n",
|
| 565 |
+
" \"\"\"\n",
|
| 566 |
+
" vector element\n",
|
| 567 |
+
" \"\"\"\n",
|
| 568 |
+
" el = 0\n",
|
| 569 |
+
" if vw in words:\n",
|
| 570 |
+
" if byCount:\n",
|
| 571 |
+
" if normalized:\n",
|
| 572 |
+
" el = self.wordFreq[vw]\n",
|
| 573 |
+
" else:\n",
|
| 574 |
+
" el = self.wordCounter[vw]\n",
|
| 575 |
+
" else:\n",
|
| 576 |
+
" el = 1\n",
|
| 577 |
+
" return el\n",
|
| 578 |
+
"\n",
|
| 579 |
+
" def save(self, saveFile):\n",
|
| 580 |
+
" \"\"\"\n",
|
| 581 |
+
" save\n",
|
| 582 |
+
" \"\"\"\n",
|
| 583 |
+
" sf = open(saveFile, \"wb\")\n",
|
| 584 |
+
" pickle.dump(self, sf)\n",
|
| 585 |
+
" sf.close()\n",
|
| 586 |
+
"\n",
|
| 587 |
+
" # load \n",
|
| 588 |
+
" @staticmethod\n",
|
| 589 |
+
" def load(saveFile):\n",
|
| 590 |
+
" \"\"\"\n",
|
| 591 |
+
" load\n",
|
| 592 |
+
" \"\"\"\n",
|
| 593 |
+
" sf = open(saveFile, \"rb\")\n",
|
| 594 |
+
" tfidf = pickle.load(sf)\n",
|
| 595 |
+
" sf.close()\n",
|
| 596 |
+
" return tfidf\n",
|
| 597 |
+
"\n",
|
| 598 |
+
"# bigram\n",
|
| 599 |
+
"class BiGram(NGram):\n",
|
| 600 |
+
" def __init__(self, vocFilt, verbose=False):\n",
|
| 601 |
+
" \"\"\"\n",
|
| 602 |
+
" initialize\n",
|
| 603 |
+
" \"\"\"\n",
|
| 604 |
+
" super(BiGram, self).__init__(vocFilt, verbose)\n",
|
| 605 |
+
"\n",
|
| 606 |
+
" def toNGram(self, words):\n",
|
| 607 |
+
" \"\"\"\n",
|
| 608 |
+
" convert to Ngrams\n",
|
| 609 |
+
" \"\"\"\n",
|
| 610 |
+
" return self.toBiGram(words)\n",
|
| 611 |
+
"\n",
|
| 612 |
+
"# trigram\n",
|
| 613 |
+
"class TriGram(NGram):\n",
|
| 614 |
+
" def __init__(self, vocFilt, verbose=False):\n",
|
| 615 |
+
" \"\"\"\n",
|
| 616 |
+
" initialize\n",
|
| 617 |
+
" \"\"\"\n",
|
| 618 |
+
" super(TriGram, self).__init__(vocFilt, verbose)\n",
|
| 619 |
+
"\n",
|
| 620 |
+
" def toNGram(self, words):\n",
|
| 621 |
+
" \"\"\"\n",
|
| 622 |
+
" convert to Ngrams\n",
|
| 623 |
+
" \"\"\"\n",
|
| 624 |
+
" return self.toTriGram(words)\n",
|
| 625 |
+
"\n",
|
| 626 |
+
"\n",
|
| 627 |
+
"\n",
|
| 628 |
+
"class DocSentences:\n",
|
| 629 |
+
" \"\"\"\n",
|
| 630 |
+
" sentence processor\n",
|
| 631 |
+
" \"\"\"\n",
|
| 632 |
+
" def __init__(self, filePath, minLength, verbose, text=None):\n",
|
| 633 |
+
" \"\"\"\n",
|
| 634 |
+
" initialize\n",
|
| 635 |
+
" \"\"\"\n",
|
| 636 |
+
" if filePath:\n",
|
| 637 |
+
" self.filePath = filePath\n",
|
| 638 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 639 |
+
" content = contentFile.read()\n",
|
| 640 |
+
" elif text:\n",
|
| 641 |
+
" content = text\n",
|
| 642 |
+
" else:\n",
|
| 643 |
+
" raise valueError(\"either file path or text must be provided\")\n",
|
| 644 |
+
"\n",
|
| 645 |
+
" #self.sentences = content.split('.')\n",
|
| 646 |
+
" self.verbose = verbose\n",
|
| 647 |
+
" tp = TextPreProcessor()\n",
|
| 648 |
+
" content = tp.removeNonAsciiFromText(content)\n",
|
| 649 |
+
" sentences = sent_tokenize(content)\n",
|
| 650 |
+
" self.sentences = list(filter(lambda s: len(nltk.word_tokenize(s)) >= minLength, sentences))\n",
|
| 651 |
+
" if self.verbose:\n",
|
| 652 |
+
" print (\"num of senteces after length filter \" + str(len(self.sentences)))\n",
|
| 653 |
+
" self.sentencesAsTokens = [clean(s, tp, verbose) for s in self.sentences]\t\n",
|
| 654 |
+
"\n",
|
| 655 |
+
" # get sentence tokens\n",
|
| 656 |
+
" def getSentencesAsTokens(self):\n",
|
| 657 |
+
" return self.sentencesAsTokens\n",
|
| 658 |
+
"\n",
|
| 659 |
+
" # get sentences\n",
|
| 660 |
+
" def getSentences(self):\n",
|
| 661 |
+
" return self.sentences\n",
|
| 662 |
+
"\n",
|
| 663 |
+
" # build term freq table\n",
|
| 664 |
+
" def getTermFreqTable(self):\n",
|
| 665 |
+
" # term count table for all words\n",
|
| 666 |
+
" termTable = TfIdf(None, False)\n",
|
| 667 |
+
" sentWords = self.getSentencesAsTokens()\n",
|
| 668 |
+
" for seWords in sentWords:\n",
|
| 669 |
+
" termTable.countDocWords(seWords)\n",
|
| 670 |
+
" return termTable\n",
|
| 671 |
+
"\n",
|
| 672 |
+
"# sentence processor\n",
|
| 673 |
+
"class WordVectorContainer:\n",
|
| 674 |
+
" def __init__(self, dirPath, verbose):\n",
|
| 675 |
+
" \"\"\"\n",
|
| 676 |
+
" initialize\n",
|
| 677 |
+
" \"\"\"\n",
|
| 678 |
+
" self.docs = list()\n",
|
| 679 |
+
" self.wordVectors = list()\n",
|
| 680 |
+
" self.tp = TextPreProcessor()\n",
|
| 681 |
+
" self.similarityAlgo = \"cosine\"\n",
|
| 682 |
+
" self.simAlgoNormalizer = None\n",
|
| 683 |
+
" self.termTable = None\n",
|
| 684 |
+
"\n",
|
| 685 |
+
"\n",
|
| 686 |
+
" def addDir(self, dirPath):\n",
|
| 687 |
+
" \"\"\"\n",
|
| 688 |
+
" add content of all files ina directory\n",
|
| 689 |
+
" \"\"\"\n",
|
| 690 |
+
" docs, filePaths = getFileContent(dirPath, verbose)\n",
|
| 691 |
+
" self.docs.extend(docs)\n",
|
| 692 |
+
" self.wordVectors.extend([clean(doc, self.tp, verbose) for doc in docs])\n",
|
| 693 |
+
"\n",
|
| 694 |
+
" def addFile(self, filePath):\n",
|
| 695 |
+
" \"\"\"\n",
|
| 696 |
+
" add file content\n",
|
| 697 |
+
" \"\"\"\n",
|
| 698 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 699 |
+
" content = contentFile.read()\n",
|
| 700 |
+
" self.wordVectors.append(clean(content, self.tp, verbose))\n",
|
| 701 |
+
"\n",
|
| 702 |
+
" def addText(self, text):\n",
|
| 703 |
+
" \"\"\"\n",
|
| 704 |
+
" add text\n",
|
| 705 |
+
" \"\"\"\n",
|
| 706 |
+
" self.wordVectors.append(clean(text, self.tp, verbose))\n",
|
| 707 |
+
"\n",
|
| 708 |
+
" def addWords(self, words):\n",
|
| 709 |
+
" \"\"\"\n",
|
| 710 |
+
" add words\n",
|
| 711 |
+
" \"\"\"\n",
|
| 712 |
+
" self.wordVectors.append(words)\n",
|
| 713 |
+
"\n",
|
| 714 |
+
" def withSimilarityAlgo(self, algo, normalizer=None):\n",
|
| 715 |
+
" \"\"\"\n",
|
| 716 |
+
" set similarity algo\n",
|
| 717 |
+
" \"\"\"\n",
|
| 718 |
+
" self.similarityAlgo = algo\n",
|
| 719 |
+
" self.simAlgoNormalizer = normalizer\n",
|
| 720 |
+
"\n",
|
| 721 |
+
" def getDocsWords(self):\n",
|
| 722 |
+
" \"\"\"\n",
|
| 723 |
+
" get word vectors\n",
|
| 724 |
+
" \"\"\"\n",
|
| 725 |
+
" return self.wordVectors\n",
|
| 726 |
+
"\n",
|
| 727 |
+
" def getDocs(self):\n",
|
| 728 |
+
" \"\"\"\n",
|
| 729 |
+
" get docs\n",
|
| 730 |
+
" \"\"\"\n",
|
| 731 |
+
" return self.docs\n",
|
| 732 |
+
"\n",
|
| 733 |
+
" def getTermFreqTable(self):\n",
|
| 734 |
+
" \"\"\"\n",
|
| 735 |
+
" term count table for all words\n",
|
| 736 |
+
" \"\"\"\n",
|
| 737 |
+
" self.termTable = TfIdf(None, False)\n",
|
| 738 |
+
" for words in self.wordVectors:\n",
|
| 739 |
+
" self.termTable.countDocWords(words)\n",
|
| 740 |
+
" self.termTable.getWordFreq()\n",
|
| 741 |
+
" return self.termTable\n",
|
| 742 |
+
"\n",
|
| 743 |
+
" def getPairWiseSimilarity(self, byCount, normalized):\n",
|
| 744 |
+
" \"\"\"\n",
|
| 745 |
+
" pair wise similarity\n",
|
| 746 |
+
" \"\"\"\n",
|
| 747 |
+
" self.getNumWordVectors()\n",
|
| 748 |
+
"\n",
|
| 749 |
+
" size = len(self.wordVectors)\n",
|
| 750 |
+
" simArray = np.empty(shape=(size,size))\n",
|
| 751 |
+
" for i in range(size):\n",
|
| 752 |
+
" simArray[i][i] = 1.0\n",
|
| 753 |
+
"\n",
|
| 754 |
+
" for i in range(size):\n",
|
| 755 |
+
" for j in range(i+1, size):\n",
|
| 756 |
+
" if self.similarityAlgo == \"cosine\":\n",
|
| 757 |
+
" sim = cosineSimilarity(self.numWordVectors[i], self.numWordVectors[j])\n",
|
| 758 |
+
" elif self.similarityAlgo == \"jaccard\":\n",
|
| 759 |
+
" sim = jaccardSimilarity(self.wordVectors[i], self.wordVectors[j],\\\n",
|
| 760 |
+
" self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
|
| 761 |
+
" else:\n",
|
| 762 |
+
" raise ValueError(\"invalid similarity algorithms\")\n",
|
| 763 |
+
" simArray[i][j] = sim\n",
|
| 764 |
+
" simArray[j][i] = sim\n",
|
| 765 |
+
" return simArray\n",
|
| 766 |
+
"\n",
|
| 767 |
+
" def getInterSetSimilarity(self, byCount, normalized, split):\n",
|
| 768 |
+
" \"\"\"\n",
|
| 769 |
+
" inter set pair wise similarity\n",
|
| 770 |
+
" \"\"\"\n",
|
| 771 |
+
" self.getNumWordVectors()\n",
|
| 772 |
+
" size = len(self.wordVectors)\n",
|
| 773 |
+
" if not self.similarityAlgo == \"jaccard\":\n",
|
| 774 |
+
" firstNumVec = self.numWordVectors[:split]\n",
|
| 775 |
+
" secNumVec = self.numWordVectors[split:]\n",
|
| 776 |
+
" fiSize = len(firstNumVec)\n",
|
| 777 |
+
" seSize = len(secNumVec)\n",
|
| 778 |
+
" else:\n",
|
| 779 |
+
" firstVec = self.wordVectors[:split]\n",
|
| 780 |
+
" secVec = self.wordVectors[split:]\n",
|
| 781 |
+
" fiSize = len(firstVec)\n",
|
| 782 |
+
" seSize = len(secVec)\n",
|
| 783 |
+
"\n",
|
| 784 |
+
" simArray = np.empty(shape=(fiSize,seSize))\n",
|
| 785 |
+
" for i in range(fiSize):\n",
|
| 786 |
+
" for j in range(seSize):\n",
|
| 787 |
+
" if self.similarityAlgo == \"cosine\":\n",
|
| 788 |
+
" sim = cosineSimilarity(firstNumVec[i], secNumVec[j])\n",
|
| 789 |
+
" elif self.similarityAlgo == \"jaccard\":\n",
|
| 790 |
+
" sim = jaccardSimilarity(firstVec[i], secVec[j],\\\n",
|
| 791 |
+
" self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
|
| 792 |
+
" else:\n",
|
| 793 |
+
" raise ValueError(\"invalid similarity algorithms\")\n",
|
| 794 |
+
" simArray[i][j] = sim\n",
|
| 795 |
+
" return simArray\n",
|
| 796 |
+
"\n",
|
| 797 |
+
" def getNumWordVectors(self):\n",
|
| 798 |
+
" \"\"\"\n",
|
| 799 |
+
" get vectors\n",
|
| 800 |
+
" \"\"\"\n",
|
| 801 |
+
" if not self.similarityAlgo == \"jaccard\":\n",
|
| 802 |
+
" if self.numWordVectors is None:\n",
|
| 803 |
+
" self.numWordVectors = list(map(lambda wv: self.termTable.getVector(wv, byCount, normalized), self.wordVectors))\n",
|
| 804 |
+
"\n",
|
| 805 |
+
"# fragments documents into whole doc, paragraph or passages\n",
|
| 806 |
+
"class TextFragmentGenerator:\n",
|
| 807 |
+
" def __init__(self, level, minParNl, passSize, verbose=False):\n",
|
| 808 |
+
" \"\"\"\n",
|
| 809 |
+
" initialize\n",
|
| 810 |
+
" \"\"\"\n",
|
| 811 |
+
" self.level = level\n",
|
| 812 |
+
" self.minParNl = minParNl\n",
|
| 813 |
+
" self.passSize = passSize\n",
|
| 814 |
+
" self.fragments = None\n",
|
| 815 |
+
" self.verbose = verbose\n",
|
| 816 |
+
"\n",
|
| 817 |
+
" def loadDocs(self, fpaths):\n",
|
| 818 |
+
" \"\"\"\n",
|
| 819 |
+
" loads documents from one file, multiple files or all files under directory\n",
|
| 820 |
+
" \"\"\"\n",
|
| 821 |
+
" fPaths = fpaths.split(\",\")\n",
|
| 822 |
+
" if len(fPaths) == 1:\n",
|
| 823 |
+
" if os.path.isfile(fPaths[0]):\n",
|
| 824 |
+
" #one file\n",
|
| 825 |
+
" if self.verbose:\n",
|
| 826 |
+
" print(\"got one file from path\")\n",
|
| 827 |
+
" dnames = fPaths\n",
|
| 828 |
+
" docStr = getOneFileContent(fPaths[0])\n",
|
| 829 |
+
" dtexts = [docStr]\n",
|
| 830 |
+
" else:\n",
|
| 831 |
+
" #all files under directory\n",
|
| 832 |
+
" if self.verbose:\n",
|
| 833 |
+
" print(\"got all files under directory from path\")\n",
|
| 834 |
+
" dtexts, dnames = getFileContent(fPaths[0])\n",
|
| 835 |
+
" if self.verbose:\n",
|
| 836 |
+
" print(\"found {} files\".format(len(dtexts)))\n",
|
| 837 |
+
" else:\n",
|
| 838 |
+
" #list of files\n",
|
| 839 |
+
" if self.verbose: \n",
|
| 840 |
+
" print(\"got list of files from path\")\n",
|
| 841 |
+
" dnames = fPaths\n",
|
| 842 |
+
" dtexts = list(map(getOneFileContent, fpaths))\n",
|
| 843 |
+
" if self.verbose:\n",
|
| 844 |
+
" print(\"found {} files\".format(len(dtexts)))\n",
|
| 845 |
+
"\n",
|
| 846 |
+
" ndocs = (dtexts, dnames)\t\n",
|
| 847 |
+
" if self.verbose:\n",
|
| 848 |
+
" print(\"docs\")\n",
|
| 849 |
+
" for dn, dt in zip(dnames, dtexts):\n",
|
| 850 |
+
" print(dn + \"\\t\" + dt[:40])\n",
|
| 851 |
+
"\n",
|
| 852 |
+
" return ndocs\n",
|
| 853 |
+
"\n",
|
| 854 |
+
" def generateFragmentsFromFiles(self, fpaths):\n",
|
| 855 |
+
" \"\"\"\n",
|
| 856 |
+
" fragments documents into whole doc, paragraph or passages\n",
|
| 857 |
+
" \"\"\"\n",
|
| 858 |
+
" dtexts, dnames = self.loadDocs(fpaths)\n",
|
| 859 |
+
" return self.generateFragments(dtexts, dnames)\n",
|
| 860 |
+
"\n",
|
| 861 |
+
"\n",
|
| 862 |
+
" def generateFragmentsFromNamedDocs(self, ndocs):\n",
|
| 863 |
+
" \"\"\"\n",
|
| 864 |
+
" fragments documents into whole doc, paragraph or passages\n",
|
| 865 |
+
" \"\"\"\n",
|
| 866 |
+
" dtexts = list(map(lambda nd : nd[1], ndocs))\n",
|
| 867 |
+
" dnames = list(map(lambda nd : nd[0], ndocs))\n",
|
| 868 |
+
" #for i in range(len(dtexts)):\n",
|
| 869 |
+
" #\tprint(dnames[i])\n",
|
| 870 |
+
" #\tprint(dtexts[i][:40])\n",
|
| 871 |
+
" return self.generateFragments(dtexts, dnames)\n",
|
| 872 |
+
"\n",
|
| 873 |
+
" def generateFragments(self, dtexts, dnames):\n",
|
| 874 |
+
" \"\"\"\n",
|
| 875 |
+
" fragments documents into whole doc, paragraph or passages\n",
|
| 876 |
+
" \"\"\"\n",
|
| 877 |
+
" if self.level == \"para\" or self.level == \"passage\":\n",
|
| 878 |
+
" #split paras\n",
|
| 879 |
+
" dptexts = list()\n",
|
| 880 |
+
" dpnames = list()\n",
|
| 881 |
+
" for dt, dn in zip(dtexts, dnames):\n",
|
| 882 |
+
" paras = getParas(dt, self.minParNl)\n",
|
| 883 |
+
" if self.verbose:\n",
|
| 884 |
+
" print(dn)\n",
|
| 885 |
+
" print(\"no of paras {}\".format(len(paras)))\n",
|
| 886 |
+
" dptexts.extend(paras)\n",
|
| 887 |
+
" pnames = list(map(lambda i : dn + \":\" + str(i), range(len(paras))))\n",
|
| 888 |
+
" dpnames.extend(pnames)\n",
|
| 889 |
+
" dtexts = dptexts\n",
|
| 890 |
+
" dnames = dpnames\n",
|
| 891 |
+
"\n",
|
| 892 |
+
" if self.level == \"passage\":\n",
|
| 893 |
+
" #split each para into passages\n",
|
| 894 |
+
" dptexts = list()\n",
|
| 895 |
+
" dpnames = list()\n",
|
| 896 |
+
" for dt, dn in zip(dtexts, dnames):\n",
|
| 897 |
+
" sents = sent_tokenize(dt.strip())\t\t\t\n",
|
| 898 |
+
" if self.verbose:\n",
|
| 899 |
+
" print(dn)\n",
|
| 900 |
+
" print(\"no of sentences {}\".format(len(sents)))\n",
|
| 901 |
+
" span = self.passSize\n",
|
| 902 |
+
" if len(sents) <= span:\n",
|
| 903 |
+
" pass\n",
|
| 904 |
+
" else:\n",
|
| 905 |
+
" for i in range(0, len(sents) - span, 1):\n",
|
| 906 |
+
" dptext = None\n",
|
| 907 |
+
" for j in range(span):\n",
|
| 908 |
+
" if dptext is None:\n",
|
| 909 |
+
" dptext = sents[i + j] + \". \"\n",
|
| 910 |
+
" else:\n",
|
| 911 |
+
" dptext = dptext + sents[i + j] + \". \" \n",
|
| 912 |
+
" dpname = dn + \":\" + str(i)\n",
|
| 913 |
+
" dptexts.append(dptext)\n",
|
| 914 |
+
" dpnames.append(dpname)\n",
|
| 915 |
+
"\n",
|
| 916 |
+
" dtexts = dptexts\n",
|
| 917 |
+
" dnames = dpnames\n",
|
| 918 |
+
"\n",
|
| 919 |
+
" self.fragments = list(zip(dnames, dtexts))\n",
|
| 920 |
+
" #if self.verbose:\n",
|
| 921 |
+
" #\tprint(\"num fragments {}\".format(len(self.fragments)))\n",
|
| 922 |
+
" return self.fragments\n",
|
| 923 |
+
"\n",
|
| 924 |
+
" def showFragments(self):\n",
|
| 925 |
+
" \"\"\"\n",
|
| 926 |
+
" show fragments\n",
|
| 927 |
+
" \"\"\"\n",
|
| 928 |
+
" print(\"showing all \" + self.level + \" for the first 40 characters\")\n",
|
| 929 |
+
" for dn, dt in self.fragments:\n",
|
| 930 |
+
" print(dn + \"\\t\" + dt[:40])\n",
|
| 931 |
+
"\n",
|
| 932 |
+
" def isDocLevel(self):\n",
|
| 933 |
+
" \"\"\"\n",
|
| 934 |
+
" true if fragment is at doc level\n",
|
| 935 |
+
" \"\"\"\n",
|
| 936 |
+
" return self.level != \"para\" and self.level != \"passage\"\n",
|
| 937 |
+
"\n",
|
| 938 |
+
"# clean doc to create term array\n",
|
| 939 |
+
"def clean(doc, preprocessor, verbose):\n",
|
| 940 |
+
" \"\"\"\n",
|
| 941 |
+
" text pre process\n",
|
| 942 |
+
" \"\"\"\n",
|
| 943 |
+
" if verbose:\n",
|
| 944 |
+
" print (\"--raw doc\")\n",
|
| 945 |
+
" print (doc)\n",
|
| 946 |
+
" #print \"next clean\"\n",
|
| 947 |
+
" doc = preprocessor.removeNonAsciiFromText(doc)\n",
|
| 948 |
+
" words = preprocessor.tokenize(doc)\n",
|
| 949 |
+
" words = preprocessor.allow(words)\n",
|
| 950 |
+
" words = preprocessor.toLowercase(words)\n",
|
| 951 |
+
" words = preprocessor.removeStopwords(words)\n",
|
| 952 |
+
" words = preprocessor.removeShortWords(words, 3)\n",
|
| 953 |
+
" words = preprocessor.removePunctuation(words)\n",
|
| 954 |
+
" words = preprocessor.lemmatizeWords(words)\n",
|
| 955 |
+
" #words = preprocessor.removeNonAscii(words)\n",
|
| 956 |
+
" if verbose:\n",
|
| 957 |
+
" print (\"--after pre processing\")\n",
|
| 958 |
+
" print (words)\n",
|
| 959 |
+
" return words\n",
|
| 960 |
+
"\n",
|
| 961 |
+
"# get sentences\n",
|
| 962 |
+
"def getSentences(filePath):\n",
|
| 963 |
+
" \"\"\"\n",
|
| 964 |
+
" text pre process\n",
|
| 965 |
+
" \"\"\"\n",
|
| 966 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 967 |
+
" content = contentFile.read()\n",
|
| 968 |
+
" sentences = content.split('.')\n",
|
| 969 |
+
" return sentences\n",
|
| 970 |
+
"\n",
|
| 971 |
+
"def getParas(text, minParNl=2):\n",
|
| 972 |
+
" \"\"\"\n",
|
| 973 |
+
" split into paras\n",
|
| 974 |
+
" \"\"\"\n",
|
| 975 |
+
" regx = \"\\n+\" if minParNl == 1 else \"\\n{2,}\"\n",
|
| 976 |
+
" paras = re.split(regx, text.replace(\"\\r\\n\", \"\\n\"))\n",
|
| 977 |
+
" return paras\n"
|
| 978 |
+
]
|
| 979 |
+
}
|
| 980 |
+
],
|
| 981 |
+
"metadata": {
|
| 982 |
+
"kernelspec": {
|
| 983 |
+
"display_name": "Python 3 (ipykernel)",
|
| 984 |
+
"language": "python",
|
| 985 |
+
"name": "python3"
|
| 986 |
+
},
|
| 987 |
+
"language_info": {
|
| 988 |
+
"codemirror_mode": {
|
| 989 |
+
"name": "ipython",
|
| 990 |
+
"version": 3
|
| 991 |
+
},
|
| 992 |
+
"file_extension": ".py",
|
| 993 |
+
"mimetype": "text/x-python",
|
| 994 |
+
"name": "python",
|
| 995 |
+
"nbconvert_exporter": "python",
|
| 996 |
+
"pygments_lexer": "ipython3",
|
| 997 |
+
"version": "3.9.12"
|
| 998 |
+
}
|
| 999 |
+
},
|
| 1000 |
+
"nbformat": 4,
|
| 1001 |
+
"nbformat_minor": 5
|
| 1002 |
+
}
|
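A minimal usage sketch of the classes defined in the cell above (illustrative only, not part of the uploaded files; the sample text and variable names are assumptions, and the NLTK data packages punkt, stopwords and wordnet are assumed to be available):

    # hypothetical example: clean one document, then build term and bigram frequency tables
    tp = TextPreProcessor()
    doc = "Machine learning models need clean text. Clean text gives better features."
    words = clean(doc, tp, False)    # tokenize, filter, lowercase, drop stopwords, lemmatize

    tfidf = TfIdf(None, False)       # no vocabulary filter, plain term frequency (no IDF)
    tfidf.countDocWords(words)
    print(tfidf.getWordFreq())       # normalized term frequencies for the document

    bigram = BiGram(None)            # word-bigram counter, no vocabulary filter
    bigram.countDocNGrams(words)
    print(bigram.getNGramFreq())     # normalized bigram frequencies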
lib/util.ipynb
ADDED
|
@@ -0,0 +1,2141 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "21cb09bb",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"from random import randint\n",
|
| 13 |
+
"import random\n",
|
| 14 |
+
"import time\n",
|
| 15 |
+
"import uuid\n",
|
| 16 |
+
"from datetime import datetime\n",
|
| 17 |
+
"import math\n",
|
| 18 |
+
"import numpy as np\n",
|
| 19 |
+
"import pandas as pd\n",
|
| 20 |
+
"import matplotlib.pyplot as plt\n",
|
| 21 |
+
"import numpy as np\n",
|
| 22 |
+
"import logging\n",
|
| 23 |
+
"import logging.handlers\n",
|
| 24 |
+
"import pickle\n",
|
| 25 |
+
"from contextlib import contextmanager\n",
|
| 26 |
+
"\n",
|
| 27 |
+
"tokens = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\",\n",
|
| 28 |
+
" \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
|
| 29 |
+
"numTokens = tokens[:10]\n",
|
| 30 |
+
"alphaTokens = tokens[10:36]\n",
|
| 31 |
+
"loCaseChars = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
|
| 32 |
+
"\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"typeInt = \"int\"\n",
|
| 35 |
+
"typeFloat = \"float\"\n",
|
| 36 |
+
"typeString = \"string\"\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"secInMinute = 60\n",
|
| 39 |
+
"secInHour = 60 * 60\n",
|
| 40 |
+
"secInDay = 24 * secInHour\n",
|
| 41 |
+
"secInWeek = 7 * secInDay\n",
|
| 42 |
+
"secInYear = 365 * secInDay\n",
|
| 43 |
+
"secInMonth = secInYear / 12\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"minInHour = 60\n",
|
| 46 |
+
"minInDay = 24 * minInHour\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"ftPerYard = 3\n",
|
| 49 |
+
"ftPerMile = ftPerYard * 1760\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"def genID(size):\n",
|
| 53 |
+
" \"\"\"\n",
|
| 54 |
+
" generates ID\n",
|
| 55 |
+
"\n",
|
| 56 |
+
" Parameters\n",
|
| 57 |
+
" size : size of ID\n",
|
| 58 |
+
" \"\"\"\n",
|
| 59 |
+
" id = \"\"\n",
|
| 60 |
+
" for i in range(size):\n",
|
| 61 |
+
" id = id + selectRandomFromList(tokens)\n",
|
| 62 |
+
" return id\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"def genIdList(numId, idSize):\n",
|
| 65 |
+
" \"\"\"\n",
|
| 66 |
+
" generate list of IDs\n",
|
| 67 |
+
"\n",
|
| 68 |
+
" Parameters:\n",
|
| 69 |
+
" numId: number of Ids\n",
|
| 70 |
+
" idSize: ID size\n",
|
| 71 |
+
" \"\"\"\n",
|
| 72 |
+
" iDs = []\n",
|
| 73 |
+
" for i in range(numId):\n",
|
| 74 |
+
" iDs.append(genID(idSize))\n",
|
| 75 |
+
" return iDs\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"def genNumID(size):\n",
|
| 78 |
+
" \"\"\"\n",
|
| 79 |
+
" generates ID consisting of digits onl\n",
|
| 80 |
+
"\n",
|
| 81 |
+
" Parameters\n",
|
| 82 |
+
" size : size of ID\n",
|
| 83 |
+
" \"\"\"\n",
|
| 84 |
+
" id = \"\"\n",
|
| 85 |
+
" for i in range(size):\n",
|
| 86 |
+
" id = id + selectRandomFromList(numTokens)\n",
|
| 87 |
+
" return id\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"def genLowCaseID(size):\n",
|
| 90 |
+
" \"\"\"\n",
|
| 91 |
+
" generates ID consisting of lower case chars\n",
|
| 92 |
+
"\n",
|
| 93 |
+
" Parameters\n",
|
| 94 |
+
" size : size of ID\n",
|
| 95 |
+
" \"\"\"\n",
|
| 96 |
+
" id = \"\"\n",
|
| 97 |
+
" for i in range(size):\n",
|
| 98 |
+
" id = id + selectRandomFromList(loCaseChars)\n",
|
| 99 |
+
" return id\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"def genNumIdList(numId, idSize):\n",
|
| 102 |
+
" \"\"\"\n",
|
| 103 |
+
" generate list of numeric IDs\n",
|
| 104 |
+
"\n",
|
| 105 |
+
" Parameters:\n",
|
| 106 |
+
" numId: number of Ids\n",
|
| 107 |
+
" idSize: ID size\n",
|
| 108 |
+
" \"\"\"\n",
|
| 109 |
+
" iDs = []\n",
|
| 110 |
+
" for i in range(numId):\n",
|
| 111 |
+
" iDs.append(genNumID(idSize))\n",
|
| 112 |
+
" return iDs\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"def genNameInitial():\n",
|
| 115 |
+
" \"\"\"\n",
|
| 116 |
+
" generate name initial\n",
|
| 117 |
+
" \"\"\"\n",
|
| 118 |
+
" return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"def genPhoneNum(arCode):\n",
|
| 121 |
+
" \"\"\"\n",
|
| 122 |
+
" generates phone number\n",
|
| 123 |
+
"\n",
|
| 124 |
+
" Parameters\n",
|
| 125 |
+
" arCode: area code\n",
|
| 126 |
+
" \"\"\"\n",
|
| 127 |
+
" phNum = genNumID(7)\n",
|
| 128 |
+
" return arCode + str(phNum)\n",
|
| 129 |
+
"\n",
|
| 130 |
+
"def selectRandomFromList(ldata):\n",
|
| 131 |
+
" \"\"\"\n",
|
| 132 |
+
" select an element randomly from a lis\n",
|
| 133 |
+
"\n",
|
| 134 |
+
" Parameters\n",
|
| 135 |
+
" ldata : list data\n",
|
| 136 |
+
" \"\"\"\n",
|
| 137 |
+
" return ldata[randint(0, len(ldata)-1)]\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"def selectOtherRandomFromList(ldata, cval):\n",
|
| 140 |
+
" \"\"\"\n",
|
| 141 |
+
" select an element randomly from a list excluding the given one\n",
|
| 142 |
+
"\n",
|
| 143 |
+
" Parameters\n",
|
| 144 |
+
" ldata : list data\n",
|
| 145 |
+
" cval : value to be excluded\n",
|
| 146 |
+
" \"\"\"\n",
|
| 147 |
+
" nval = selectRandomFromList(ldata)\n",
|
| 148 |
+
" while nval == cval:\n",
|
| 149 |
+
" nval = selectRandomFromList(ldata)\n",
|
| 150 |
+
" return nval\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"def selectRandomSubListFromList(ldata, num):\n",
|
| 153 |
+
" \"\"\"\n",
|
| 154 |
+
" generates random sublist from a list without replacemment\n",
|
| 155 |
+
"\n",
|
| 156 |
+
" Parameters\n",
|
| 157 |
+
" ldata : list data\n",
|
| 158 |
+
" num : output list size\n",
|
| 159 |
+
" \"\"\"\n",
|
| 160 |
+
" assertLesser(num, len(ldata), \"size of sublist to be sampled greater than or equal to main list\")\n",
|
| 161 |
+
" i = randint(0, len(ldata)-1)\n",
|
| 162 |
+
" sel = ldata[i]\n",
|
| 163 |
+
" selSet = {i}\n",
|
| 164 |
+
" selList = [sel]\n",
|
| 165 |
+
" while (len(selSet) < num):\n",
|
| 166 |
+
" i = randint(0, len(ldata)-1)\n",
|
| 167 |
+
" if (i not in selSet):\n",
|
| 168 |
+
" sel = ldata[i]\n",
|
| 169 |
+
" selSet.add(i)\n",
|
| 170 |
+
" selList.append(sel)\n",
|
| 171 |
+
" return selList\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"def selectRandomSubListFromListWithRepl(ldata, num):\n",
|
| 174 |
+
" \"\"\"\n",
|
| 175 |
+
" generates random sublist from a list with replacemment\n",
|
| 176 |
+
"\n",
|
| 177 |
+
" Parameters\n",
|
| 178 |
+
" ldata : list data\n",
|
| 179 |
+
" num : output list size\n",
|
| 180 |
+
" \"\"\"\n",
|
| 181 |
+
" return list(map(lambda i : selectRandomFromList(ldata), range(num)))\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"def selectRandomFromDict(ddata):\n",
|
| 184 |
+
" \"\"\"\n",
|
| 185 |
+
" select an element randomly from a dictionary\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" Parameters\n",
|
| 188 |
+
" ddata : dictionary data\n",
|
| 189 |
+
" \"\"\"\n",
|
| 190 |
+
" dkeys = list(ddata.keys())\n",
|
| 191 |
+
" dk = selectRandomFromList(dkeys)\n",
|
| 192 |
+
" el = (dk, ddata[dk])\n",
|
| 193 |
+
" return el\n",
|
| 194 |
+
"\n",
|
| 195 |
+
"def setListRandomFromList(ldata, ldataRepl):\n",
|
| 196 |
+
" \"\"\"\n",
|
| 197 |
+
" sets some elents in the first list randomly with elements from the second list\n",
|
| 198 |
+
"\n",
|
| 199 |
+
" Parameters\n",
|
| 200 |
+
" ldata : list data\n",
|
| 201 |
+
" ldataRepl : list with replacement data\n",
|
| 202 |
+
" \"\"\"\n",
|
| 203 |
+
" l = len(ldata)\n",
|
| 204 |
+
" selSet = set()\n",
|
| 205 |
+
" for d in ldataRepl:\n",
|
| 206 |
+
" i = randint(0, l-1)\n",
|
| 207 |
+
" while i in selSet:\n",
|
| 208 |
+
" i = randint(0, l-1)\n",
|
| 209 |
+
" ldata[i] = d\n",
|
| 210 |
+
" selSet.add(i)\n",
|
| 211 |
+
"\n",
|
| 212 |
+
"def genIpAddress():\n",
|
| 213 |
+
" \"\"\"\n",
|
| 214 |
+
" generates IP address\n",
|
| 215 |
+
" \"\"\"\n",
|
| 216 |
+
" i1 = randint(0,256)\n",
|
| 217 |
+
" i2 = randint(0,256)\n",
|
| 218 |
+
" i3 = randint(0,256)\n",
|
| 219 |
+
" i4 = randint(0,256)\n",
|
| 220 |
+
" ip = \"%d.%d.%d.%d\" %(i1,i2,i3,i4)\n",
|
| 221 |
+
" return ip\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"def curTimeMs():\n",
|
| 224 |
+
" \"\"\"\n",
|
| 225 |
+
" current time in ms\n",
|
| 226 |
+
" \"\"\"\n",
|
| 227 |
+
" return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"def secDegPolyFit(x1, y1, x2, y2, x3, y3):\n",
|
| 230 |
+
" \"\"\"\n",
|
| 231 |
+
" second deg polynomial \t\n",
|
| 232 |
+
"\n",
|
| 233 |
+
" Parameters\n",
|
| 234 |
+
" x1 : 1st point x\n",
|
| 235 |
+
" y1 : 1st point y\n",
|
| 236 |
+
" x2 : 2nd point x\n",
|
| 237 |
+
" y2 : 2nd point y\n",
|
| 238 |
+
" x3 : 3rd point x\n",
|
| 239 |
+
" y3 : 3rd point y\n",
|
| 240 |
+
" \"\"\"\n",
|
| 241 |
+
" t = (y1 - y2) / (x1 - x2)\n",
|
| 242 |
+
" a = t - (y2 - y3) / (x2 - x3)\n",
|
| 243 |
+
" a = a / (x1 - x3)\n",
|
| 244 |
+
" b = t - a * (x1 + x2)\n",
|
| 245 |
+
" c = y1 - a * x1 * x1 - b * x1\n",
|
| 246 |
+
" return (a, b, c)\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"def range_limit(val, minv, maxv):\n",
|
| 249 |
+
" \"\"\"\n",
|
| 250 |
+
" range limit a value\n",
|
| 251 |
+
"\n",
|
| 252 |
+
" Parameters\n",
|
| 253 |
+
" val : data value\n",
|
| 254 |
+
" minv : minimum\n",
|
| 255 |
+
" maxv : maximum\n",
|
| 256 |
+
" \"\"\"\n",
|
| 257 |
+
" if (val < minv):\n",
|
| 258 |
+
" val = minv\n",
|
| 259 |
+
" elif (val > maxv):\n",
|
| 260 |
+
" val = maxv\n",
|
| 261 |
+
" return val\n",
|
| 262 |
+
"\n",
|
| 263 |
+
"def isInRange(val, minv, maxv):\n",
|
| 264 |
+
" \"\"\"\n",
|
| 265 |
+
" checks if within range\n",
|
| 266 |
+
"\n",
|
| 267 |
+
" Parameters\n",
|
| 268 |
+
" val : data value\n",
|
| 269 |
+
" minv : minimum\n",
|
| 270 |
+
" maxv : maximum\n",
|
| 271 |
+
" \"\"\"\n",
|
| 272 |
+
" return val >= minv and val <= maxv\n",
|
| 273 |
+
"\n",
|
| 274 |
+
"def stripFileLines(filePath, offset):\n",
|
| 275 |
+
" \"\"\"\n",
|
| 276 |
+
" strips number of chars from both ends\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" Parameters\n",
|
| 279 |
+
" filePath : file path\n",
|
| 280 |
+
" offset : offset from both ends of line \n",
|
| 281 |
+
" \"\"\"\n",
|
| 282 |
+
" fp = open(filePath, \"r\")\n",
|
| 283 |
+
" for line in fp:\n",
|
| 284 |
+
" stripped = line[offset:len(line) - 1 - offset]\n",
|
| 285 |
+
" print (stripped)\n",
|
| 286 |
+
" fp.close()\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"def genLatLong(lat1, long1, lat2, long2):\n",
|
| 289 |
+
" \"\"\"\n",
|
| 290 |
+
" generate lat log within limits\n",
|
| 291 |
+
"\n",
|
| 292 |
+
" Parameters\n",
|
| 293 |
+
" lat1 : lat of 1st point\n",
|
| 294 |
+
" long1 : long of 1st point\n",
|
| 295 |
+
" lat2 : lat of 2nd point\n",
|
| 296 |
+
" long2 : long of 2nd point\n",
|
| 297 |
+
" \"\"\"\n",
|
| 298 |
+
" lat = lat1 + (lat2 - lat1) * random.random()\n",
|
| 299 |
+
" longg = long1 + (long2 - long1) * random.random()\n",
|
| 300 |
+
" return (lat, longg)\n",
|
| 301 |
+
"\n",
|
| 302 |
+
"def geoDistance(lat1, long1, lat2, long2):\n",
|
| 303 |
+
" \"\"\"\n",
|
| 304 |
+
" find geo distance in ft\n",
|
| 305 |
+
"\n",
|
| 306 |
+
" Parameters\n",
|
| 307 |
+
" lat1 : lat of 1st point\n",
|
| 308 |
+
" long1 : long of 1st point\n",
|
| 309 |
+
" lat2 : lat of 2nd point\n",
|
| 310 |
+
" long2 : long of 2nd point\n",
|
| 311 |
+
" \"\"\"\n",
|
| 312 |
+
" latDiff = math.radians(lat1 - lat2)\n",
|
| 313 |
+
" longDiff = math.radians(long1 - long2)\n",
|
| 314 |
+
" l1 = math.sin(latDiff/2.0)\n",
|
| 315 |
+
" l2 = math.sin(longDiff/2.0)\n",
|
| 316 |
+
" l3 = math.cos(math.radians(lat1))\n",
|
| 317 |
+
" l4 = math.cos(math.radians(lat2))\n",
|
| 318 |
+
" a = l1 * l1 + l3 * l4 * l2 * l2\n",
|
| 319 |
+
" l5 = math.sqrt(a)\n",
|
| 320 |
+
" l6 = math.sqrt(1.0 - a)\n",
|
| 321 |
+
" c = 2.0 * math.atan2(l5, l6)\n",
|
| 322 |
+
" r = 6371008.8 * 3.280840\n",
|
| 323 |
+
" return c * r\n",
|
| 324 |
+
"\n",
|
| 325 |
+
"def minLimit(val, limit):\n",
|
| 326 |
+
" \"\"\"\n",
|
| 327 |
+
" min limit\n",
|
| 328 |
+
" Parameters\n",
|
| 329 |
+
" \"\"\"\n",
|
| 330 |
+
" if (val < limit):\n",
|
| 331 |
+
" val = limit\n",
|
| 332 |
+
" return val;\n",
|
| 333 |
+
"\n",
|
| 334 |
+
"def maxLimit(val, limit):\n",
|
| 335 |
+
" \"\"\"\n",
|
| 336 |
+
" max limit\n",
|
| 337 |
+
" Parameters\n",
|
| 338 |
+
" \"\"\"\n",
|
| 339 |
+
" if (val > limit):\n",
|
| 340 |
+
" val = limit\n",
|
| 341 |
+
" return val;\n",
|
| 342 |
+
"\n",
|
| 343 |
+
"def rangeSample(val, minLim, maxLim):\n",
|
| 344 |
+
" \"\"\"\n",
|
| 345 |
+
" if out side range sample within range\n",
|
| 346 |
+
"\n",
|
| 347 |
+
" Parameters\n",
|
| 348 |
+
" val : value\n",
|
| 349 |
+
" minLim : minimum\n",
|
| 350 |
+
" maxLim : maximum\n",
|
| 351 |
+
" \"\"\"\n",
|
| 352 |
+
" if val < minLim or val > maxLim:\n",
|
| 353 |
+
" val = randint(minLim, maxLim)\n",
|
| 354 |
+
" return val\n",
|
| 355 |
+
"\n",
|
| 356 |
+
"def genRandomIntListWithinRange(size, minLim, maxLim):\n",
|
| 357 |
+
" \"\"\"\n",
|
| 358 |
+
" random unique list of integers within range\n",
|
| 359 |
+
"\n",
|
| 360 |
+
" Parameters\n",
|
| 361 |
+
" size : size of returned list\n",
|
| 362 |
+
" minLim : minimum\n",
|
| 363 |
+
" maxLim : maximum\n",
|
| 364 |
+
" \"\"\"\n",
|
| 365 |
+
" values = set()\n",
|
| 366 |
+
" for i in range(size):\n",
|
| 367 |
+
" val = randint(minLim, maxLim)\n",
|
| 368 |
+
" while val not in values:\n",
|
| 369 |
+
" values.add(val)\n",
|
| 370 |
+
" return list(values)\n",
|
| 371 |
+
"\n",
|
| 372 |
+
"def preturbScalar(value, vrange):\n",
|
| 373 |
+
" \"\"\"\n",
|
| 374 |
+
" preturbs a mutiplicative value within range\n",
|
| 375 |
+
"\n",
|
| 376 |
+
" Parameters\n",
|
| 377 |
+
" value : data value\n",
|
| 378 |
+
" vrange : value delta fraction\n",
|
| 379 |
+
" \"\"\"\n",
|
| 380 |
+
" scale = 1.0 - vrange + 2 * vrange * random.random() \n",
|
| 381 |
+
" return value * scale\n",
|
| 382 |
+
"\n",
|
| 383 |
+
"def preturbScalarAbs(value, vrange):\n",
|
| 384 |
+
" \"\"\"\n",
|
| 385 |
+
" preturbs an absolute value within range\n",
|
| 386 |
+
"\n",
|
| 387 |
+
" Parameters\n",
|
| 388 |
+
" value : data value\n",
|
| 389 |
+
" vrange : value delta absolute\n",
|
| 390 |
+
" \"\"\"\n",
|
| 391 |
+
" delta = - vrange + 2.0 * vrange * random.random() \n",
|
| 392 |
+
" return value + delta\n",
|
| 393 |
+
"\n",
|
| 394 |
+
"def preturbVector(values, vrange):\n",
|
| 395 |
+
" \"\"\"\n",
|
| 396 |
+
" preturbs a list within range\n",
|
| 397 |
+
"\n",
|
| 398 |
+
" Parameters\n",
|
| 399 |
+
" values : list data\n",
|
| 400 |
+
" vrange : value delta fraction\n",
|
| 401 |
+
" \"\"\"\n",
|
| 402 |
+
" nValues = list(map(lambda va: preturbScalar(va, vrange), values))\n",
|
| 403 |
+
" return nValues\n",
|
| 404 |
+
"\n",
|
| 405 |
+
"def randomShiftVector(values, smin, smax):\n",
|
| 406 |
+
" \"\"\"\n",
|
| 407 |
+
" shifts a list by a random quanity with a range\n",
|
| 408 |
+
"\n",
|
| 409 |
+
" Parameters\n",
|
| 410 |
+
" values : list data\n",
|
| 411 |
+
" smin : samplinf minimum\n",
|
| 412 |
+
" smax : sampling maximum\n",
|
| 413 |
+
" \"\"\"\n",
|
| 414 |
+
" shift = np.random.uniform(smin, smax)\n",
|
| 415 |
+
" return list(map(lambda va: va + shift, values))\n",
|
| 416 |
+
"\n",
|
| 417 |
+
"def floatRange(beg, end, incr):\n",
|
| 418 |
+
" \"\"\"\n",
|
| 419 |
+
" generates float range\n",
|
| 420 |
+
"\n",
|
| 421 |
+
" Parameters\n",
|
| 422 |
+
" beg :range begin\n",
|
| 423 |
+
" end: range end\n",
|
| 424 |
+
" incr : range increment\n",
|
| 425 |
+
" \"\"\"\n",
|
| 426 |
+
" return list(np.arange(beg, end, incr))\n",
|
| 427 |
+
"\n",
|
| 428 |
+
"def shuffle(values, *numShuffles):\n",
|
| 429 |
+
" \"\"\"\n",
|
| 430 |
+
" in place shuffling with swap of pairs\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" Parameters\n",
|
| 433 |
+
" values : list data\n",
|
| 434 |
+
" numShuffles : parameter list for number of shuffles\n",
|
| 435 |
+
" \"\"\"\n",
|
| 436 |
+
" size = len(values)\n",
|
| 437 |
+
" if len(numShuffles) == 0:\n",
|
| 438 |
+
" numShuffle = int(size / 2)\n",
|
| 439 |
+
" elif len(numShuffles) == 1:\n",
|
| 440 |
+
" numShuffle = numShuffles[0]\n",
|
| 441 |
+
" else:\n",
|
| 442 |
+
" numShuffle = randint(numShuffles[0], numShuffles[1])\n",
|
| 443 |
+
" print(\"numShuffle {}\".format(numShuffle))\n",
|
| 444 |
+
" for i in range(numShuffle):\n",
|
| 445 |
+
" first = random.randint(0, size - 1)\n",
|
| 446 |
+
" second = random.randint(0, size - 1)\n",
|
| 447 |
+
" while first == second:\n",
|
| 448 |
+
" second = random.randint(0, size - 1)\n",
|
| 449 |
+
" tmp = values[first]\n",
|
| 450 |
+
" values[first] = values[second]\n",
|
| 451 |
+
" values[second] = tmp\n",
|
| 452 |
+
"\n",
|
| 453 |
+
"\n",
|
| 454 |
+
"def splitList(itms, numGr):\n",
|
| 455 |
+
" \"\"\"\n",
|
| 456 |
+
" splits a list into sub lists of approximately equal size, with items in sublists randomly chod=sen\n",
|
| 457 |
+
"\n",
|
| 458 |
+
" Parameters\n",
|
| 459 |
+
" itms ; list of values\t\t\n",
|
| 460 |
+
" numGr : no of groups\n",
|
| 461 |
+
" \"\"\"\n",
|
| 462 |
+
" tcount = len(itms)\n",
|
| 463 |
+
" cItems = list(itms)\n",
|
| 464 |
+
" sz = int(len(cItems) / numGr)\n",
|
| 465 |
+
" groups = list()\n",
|
| 466 |
+
" count = 0\n",
|
| 467 |
+
" for i in range(numGr):\n",
|
| 468 |
+
" if (i == numGr - 1):\n",
|
| 469 |
+
" csz = tcount - count\n",
|
| 470 |
+
" else:\n",
|
| 471 |
+
" csz = sz + randint(-2, 2)\n",
|
| 472 |
+
" count += csz\n",
|
| 473 |
+
" gr = list()\n",
|
| 474 |
+
" for j in range(csz):\n",
|
| 475 |
+
" it = selectRandomFromList(cItems)\n",
|
| 476 |
+
" gr.append(it)\n",
|
| 477 |
+
" cItems.remove(it)\n",
|
| 478 |
+
" groups.append(gr)\n",
|
| 479 |
+
" return groups\n",
|
| 480 |
+
"\n",
|
| 481 |
+
"def multVector(values, vrange):\n",
|
| 482 |
+
" \"\"\"\n",
|
| 483 |
+
" multiplies a list within value range\n",
|
| 484 |
+
"\n",
|
| 485 |
+
" Parameters\n",
|
| 486 |
+
" values : list of values\n",
|
| 487 |
+
" vrange : fraction of vaue to be used to update\n",
|
| 488 |
+
" \"\"\"\n",
|
| 489 |
+
" scale = 1.0 - vrange + 2 * vrange * random.random()\n",
|
| 490 |
+
" nValues = list(map(lambda va: va * scale, values))\n",
|
| 491 |
+
" return nValues\n",
|
| 492 |
+
"\n",
|
| 493 |
+
"def weightedAverage(values, weights):\n",
|
| 494 |
+
" \"\"\"\n",
|
| 495 |
+
" calculates weighted average\n",
|
| 496 |
+
"\n",
|
| 497 |
+
" Parameters\n",
|
| 498 |
+
" values : list of values\n",
|
| 499 |
+
" weights : list of weights\n",
|
| 500 |
+
" \"\"\"\t\t\n",
|
| 501 |
+
" assert len(values) == len(weights), \"values and weights should be same size\"\n",
|
| 502 |
+
" vw = zip(values, weights)\n",
|
| 503 |
+
" wva = list(map(lambda e : e[0] * e[1], vw))\n",
|
| 504 |
+
" #wa = sum(x * y for x, y in vw) / sum(weights)\n",
|
| 505 |
+
" wav = sum(wva) / sum(weights)\n",
|
| 506 |
+
" return wav\n",
|
| 507 |
+
"\n",
|
| 508 |
+
"def extractFields(line, delim, keepIndices):\n",
|
| 509 |
+
" \"\"\"\n",
|
| 510 |
+
" breaks a line into fields and keeps only specified fileds and returns new line\n",
|
| 511 |
+
"\n",
|
| 512 |
+
" Parameters\n",
|
| 513 |
+
" line ; deli separated string\n",
|
| 514 |
+
" delim : delemeter\n",
|
| 515 |
+
" keepIndices : list of indexes to fields to be retained\n",
|
| 516 |
+
" \"\"\"\n",
|
| 517 |
+
" items = line.split(delim)\n",
|
| 518 |
+
" newLine = []\n",
|
| 519 |
+
" for i in keepIndices:\n",
|
| 520 |
+
" newLine.append(line[i])\n",
|
| 521 |
+
" return delim.join(newLine)\n",
|
| 522 |
+
"\n",
|
| 523 |
+
"def remFields(line, delim, remIndices):\n",
|
| 524 |
+
" \"\"\"\n",
|
| 525 |
+
" removes fields from delim separated string\n",
|
| 526 |
+
"\n",
|
| 527 |
+
" Parameters\n",
|
| 528 |
+
" line ; delemeter separated string\n",
|
| 529 |
+
" delim : delemeter\n",
|
| 530 |
+
" remIndices : list of indexes to fields to be removed\n",
|
| 531 |
+
" \"\"\"\n",
|
| 532 |
+
" items = line.split(delim)\n",
|
| 533 |
+
" newLine = []\n",
|
| 534 |
+
" for i in range(len(items)):\n",
|
| 535 |
+
" if not arrayContains(remIndices, i):\n",
|
| 536 |
+
" newLine.append(line[i])\n",
|
| 537 |
+
" return delim.join(newLine)\n",
|
| 538 |
+
"\n",
|
| 539 |
+
"def extractList(data, indices):\n",
|
| 540 |
+
" \"\"\"\n",
|
| 541 |
+
" extracts list from another list, given indices\n",
|
| 542 |
+
"\n",
|
| 543 |
+
" Parameters\n",
|
| 544 |
+
" remIndices : list data\n",
|
| 545 |
+
" indices : list of indexes to fields to be retained\n",
|
| 546 |
+
" \"\"\"\n",
|
| 547 |
+
" if areAllFieldsIncluded(data, indices):\n",
|
| 548 |
+
" exList = data.copy()\n",
|
| 549 |
+
" #print(\"all indices\")\n",
|
| 550 |
+
" else:\n",
|
| 551 |
+
" exList = list()\n",
|
| 552 |
+
" le = len(data)\n",
|
| 553 |
+
" for i in indices:\n",
|
| 554 |
+
" assert i < le , \"index {} out of bound {}\".format(i, le)\n",
|
| 555 |
+
" exList.append(data[i])\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" return exList\n",
|
| 558 |
+
"\n",
|
| 559 |
+
"def arrayContains(arr, item):\n",
|
| 560 |
+
" \"\"\"\n",
|
| 561 |
+
" checks if array contains an item \n",
|
| 562 |
+
"\n",
|
| 563 |
+
" Parameters\n",
|
| 564 |
+
" arr : list data\n",
|
| 565 |
+
" item : item to search\n",
|
| 566 |
+
" \"\"\"\n",
|
| 567 |
+
" contains = True\n",
|
| 568 |
+
" try:\n",
|
| 569 |
+
" arr.index(item)\n",
|
| 570 |
+
" except ValueError:\n",
|
| 571 |
+
" contains = False\n",
|
| 572 |
+
" return contains\n",
|
| 573 |
+
"\n",
|
| 574 |
+
"def strToIntArray(line, delim=\",\"):\n",
|
| 575 |
+
" \"\"\"\n",
|
| 576 |
+
" int array from delim separated string\n",
|
| 577 |
+
"\n",
|
| 578 |
+
" Parameters\n",
|
| 579 |
+
" line ; delemeter separated string\n",
|
| 580 |
+
" \"\"\"\n",
|
| 581 |
+
" arr = line.split(delim)\n",
|
| 582 |
+
" return [int(a) for a in arr]\n",
|
| 583 |
+
"\n",
|
| 584 |
+
"def strToFloatArray(line, delim=\",\"):\n",
|
| 585 |
+
" \"\"\"\n",
|
| 586 |
+
" float array from delim separated string\n",
|
| 587 |
+
"\n",
|
| 588 |
+
" Parameters\n",
|
| 589 |
+
" line ; delemeter separated string\n",
|
| 590 |
+
" \"\"\"\n",
|
| 591 |
+
" arr = line.split(delim)\n",
|
| 592 |
+
" return [float(a) for a in arr]\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"def strListOrRangeToIntArray(line):\n",
|
| 595 |
+
" \"\"\"\n",
|
| 596 |
+
" int array from delim separated string or range\n",
|
| 597 |
+
"\n",
|
| 598 |
+
" Parameters\n",
|
| 599 |
+
" line ; delemeter separated string\n",
|
| 600 |
+
" \"\"\"\n",
|
| 601 |
+
" varr = line.split(\",\")\n",
|
| 602 |
+
" if (len(varr) > 1):\n",
|
| 603 |
+
" iarr = list(map(lambda v: int(v), varr))\n",
|
| 604 |
+
" else:\n",
|
| 605 |
+
" vrange = line.split(\":\")\n",
|
| 606 |
+
" if (len(vrange) == 2):\n",
|
| 607 |
+
" lo = int(vrange[0])\n",
|
| 608 |
+
" hi = int(vrange[1])\n",
|
| 609 |
+
" iarr = list(range(lo, hi+1))\n",
|
| 610 |
+
" else:\n",
|
| 611 |
+
" iarr = [int(line)]\n",
|
| 612 |
+
" return iarr\n",
|
| 613 |
+
"\n",
|
| 614 |
+
"def toStr(val, precision):\n",
|
| 615 |
+
" \"\"\"\n",
|
| 616 |
+
" converts any type to string\t\n",
|
| 617 |
+
"\n",
|
| 618 |
+
" Parameters\n",
|
| 619 |
+
" val : value\n",
|
| 620 |
+
" precision ; precision for float value\n",
|
| 621 |
+
" \"\"\"\n",
|
| 622 |
+
" if type(val) == float or type(val) == np.float64 or type(val) == np.float32:\n",
|
| 623 |
+
" format = \"%\" + \".%df\" %(precision)\n",
|
| 624 |
+
" sVal = format %(val)\n",
|
| 625 |
+
" else:\n",
|
| 626 |
+
" sVal = str(val)\n",
|
| 627 |
+
" return sVal\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"def toStrFromList(values, precision, delim=\",\"):\n",
|
| 630 |
+
" \"\"\"\n",
|
| 631 |
+
" converts list of any type to delim separated string\n",
|
| 632 |
+
"\n",
|
| 633 |
+
" Parameters\n",
|
| 634 |
+
" values : list data\n",
|
| 635 |
+
" precision ; precision for float value\n",
|
| 636 |
+
" delim : delemeter\n",
|
| 637 |
+
" \"\"\"\n",
|
| 638 |
+
" sValues = list(map(lambda v: toStr(v, precision), values))\n",
|
| 639 |
+
" return delim.join(sValues)\n",
|
| 640 |
+
"\n",
|
| 641 |
+
"def toIntList(values):\n",
|
| 642 |
+
" \"\"\"\n",
|
| 643 |
+
" convert to int list\n",
|
| 644 |
+
"\n",
|
| 645 |
+
" Parameters\n",
|
| 646 |
+
" values : list data\n",
|
| 647 |
+
" \"\"\"\n",
|
| 648 |
+
" return list(map(lambda va: int(va), values))\n",
|
| 649 |
+
"\n",
|
| 650 |
+
"def toFloatList(values):\n",
|
| 651 |
+
" \"\"\"\n",
|
| 652 |
+
" convert to float list\n",
|
| 653 |
+
"\n",
|
| 654 |
+
" Parameters\n",
|
| 655 |
+
" values : list data\n",
|
| 656 |
+
" \"\"\"\n",
|
| 657 |
+
" return list(map(lambda va: float(va), values))\n",
|
| 658 |
+
"\n",
|
| 659 |
+
"def toStrList(values, precision=None):\n",
|
| 660 |
+
" \"\"\"\n",
|
| 661 |
+
" convert to string list\n",
|
| 662 |
+
"\n",
|
| 663 |
+
" Parameters\n",
|
| 664 |
+
" values : list data\n",
|
| 665 |
+
" precision ; precision for float value\n",
|
| 666 |
+
" \"\"\"\n",
|
| 667 |
+
" return list(map(lambda va: toStr(va, precision), values))\n",
|
| 668 |
+
"\n",
|
| 669 |
+
"def toIntFromBoolean(value):\n",
|
| 670 |
+
" \"\"\"\n",
|
| 671 |
+
" convert to int\n",
|
| 672 |
+
"\n",
|
| 673 |
+
" Parameters\n",
|
| 674 |
+
" value : boolean value\n",
|
| 675 |
+
" \"\"\"\n",
|
| 676 |
+
" ival = 1 if value else 0\n",
|
| 677 |
+
" return ival\n",
|
| 678 |
+
"\n",
|
| 679 |
+
"def typedValue(val, dtype=None):\n",
|
| 680 |
+
" \"\"\"\n",
|
| 681 |
+
" return typed value given string, discovers data type if not specified\n",
|
| 682 |
+
"\n",
|
| 683 |
+
" Parameters\n",
|
| 684 |
+
" val : value\n",
|
| 685 |
+
" dtype : data type\n",
|
| 686 |
+
" \"\"\"\n",
|
| 687 |
+
" tVal = None\n",
|
| 688 |
+
"\n",
|
| 689 |
+
" if dtype is not None:\n",
|
| 690 |
+
" if dtype == \"num\":\n",
|
| 691 |
+
" dtype = \"int\" if dtype.find(\".\") == -1 else \"float\"\n",
|
| 692 |
+
"\n",
|
| 693 |
+
" if dtype == \"int\":\n",
|
| 694 |
+
" tVal = int(val)\n",
|
| 695 |
+
" elif dtype == \"float\":\n",
|
| 696 |
+
" tVal = float(val)\n",
|
| 697 |
+
" elif dtype == \"bool\":\n",
|
| 698 |
+
" tVal = bool(val)\n",
|
| 699 |
+
" else:\n",
|
| 700 |
+
" tVal = val\n",
|
| 701 |
+
" else:\n",
|
| 702 |
+
" if type(val) == str:\n",
|
| 703 |
+
" lVal = val.lower()\n",
|
| 704 |
+
"\n",
|
| 705 |
+
" #int\n",
|
| 706 |
+
" done = True\n",
|
| 707 |
+
" try:\n",
|
| 708 |
+
" tVal = int(val)\n",
|
| 709 |
+
" except ValueError:\n",
|
| 710 |
+
" done = False\n",
|
| 711 |
+
"\n",
|
| 712 |
+
" #float\n",
|
| 713 |
+
" if not done:\n",
|
| 714 |
+
" done = True\n",
|
| 715 |
+
" try:\n",
|
| 716 |
+
" tVal = float(val)\n",
|
| 717 |
+
" except ValueError:\n",
|
| 718 |
+
" done = False\n",
|
| 719 |
+
"\n",
|
| 720 |
+
" #boolean\n",
|
| 721 |
+
" if not done:\n",
|
| 722 |
+
" done = True\n",
|
| 723 |
+
" if lVal == \"true\":\n",
|
| 724 |
+
" tVal = True\n",
|
| 725 |
+
" elif lVal == \"false\":\n",
|
| 726 |
+
" tVal = False\n",
|
| 727 |
+
" else:\n",
|
| 728 |
+
" done = False\n",
|
| 729 |
+
" #None\t\t\n",
|
| 730 |
+
" if not done:\n",
|
| 731 |
+
" if lVal == \"none\":\n",
|
| 732 |
+
" tVal = None\n",
|
| 733 |
+
" else:\n",
|
| 734 |
+
" tVal = val\n",
|
| 735 |
+
" else:\n",
|
| 736 |
+
" tVal = val\n",
|
| 737 |
+
"\n",
|
| 738 |
+
" return tVal\n",
|
| 739 |
+
"\n",
|
| 740 |
+
"def getAllFiles(dirPath):\n",
|
| 741 |
+
" \"\"\"\n",
|
| 742 |
+
" get all files recursively\n",
|
| 743 |
+
"\n",
|
| 744 |
+
" Parameters\n",
|
| 745 |
+
" dirPath : directory path\n",
|
| 746 |
+
" \"\"\"\n",
|
| 747 |
+
" filePaths = []\n",
|
| 748 |
+
" for (thisDir, subDirs, fileNames) in os.walk(dirPath):\n",
|
| 749 |
+
" for fileName in fileNames:\n",
|
| 750 |
+
" filePaths.append(os.path.join(thisDir, fileName))\n",
|
| 751 |
+
" filePaths.sort()\n",
|
| 752 |
+
" return filePaths\n",
|
| 753 |
+
"\n",
|
| 754 |
+
"def getFileContent(fpath, verbose=False):\n",
|
| 755 |
+
" \"\"\"\n",
|
| 756 |
+
" get file contents in directory\n",
|
| 757 |
+
"\n",
|
| 758 |
+
" Parameters\n",
|
| 759 |
+
" fpath ; directory path\n",
|
| 760 |
+
" verbose : verbosity flag\n",
|
| 761 |
+
" \"\"\"\n",
|
| 762 |
+
" # dcument list\n",
|
| 763 |
+
" docComplete = []\n",
|
| 764 |
+
" filePaths = getAllFiles(fpath)\n",
|
| 765 |
+
"\n",
|
| 766 |
+
" # read files\n",
|
| 767 |
+
" for filePath in filePaths:\n",
|
| 768 |
+
" if verbose:\n",
|
| 769 |
+
" print(\"next file \" + filePath)\n",
|
| 770 |
+
" with open(filePath, 'r') as contentFile:\n",
|
| 771 |
+
" content = contentFile.read()\n",
|
| 772 |
+
" docComplete.append(content)\n",
|
| 773 |
+
" return (docComplete, filePaths)\n",
|
| 774 |
+
"\n",
|
| 775 |
+
"def getOneFileContent(fpath):\n",
|
| 776 |
+
" \"\"\"\n",
|
| 777 |
+
" get one file contents\n",
|
| 778 |
+
"\n",
|
| 779 |
+
" Parameters\n",
|
| 780 |
+
" fpath : file path\n",
|
| 781 |
+
" \"\"\"\n",
|
| 782 |
+
" with open(fpath, 'r') as contentFile:\n",
|
| 783 |
+
" docStr = contentFile.read()\n",
|
| 784 |
+
" return docStr\n",
|
| 785 |
+
"\n",
|
| 786 |
+
"def getFileLines(dirPath, delim=\",\"):\n",
|
| 787 |
+
" \"\"\"\n",
|
| 788 |
+
" get lines from a file\n",
|
| 789 |
+
"\n",
|
| 790 |
+
" Parameters\n",
|
| 791 |
+
" dirPath : file path\n",
|
| 792 |
+
" delim : delemeter\n",
|
| 793 |
+
" \"\"\"\n",
|
| 794 |
+
" lines = list()\n",
|
| 795 |
+
" for li in fileRecGen(dirPath, delim):\n",
|
| 796 |
+
" lines.append(li)\n",
|
| 797 |
+
" return lines\n",
|
| 798 |
+
"\n",
|
| 799 |
+
"def getFileSampleLines(dirPath, percen, delim=\",\"):\n",
|
| 800 |
+
" \"\"\"\n",
|
| 801 |
+
" get sampled lines from a file\n",
|
| 802 |
+
"\n",
|
| 803 |
+
" Parameters\n",
|
| 804 |
+
" dirPath : file path\n",
|
| 805 |
+
" percen : sampling percentage\n",
|
| 806 |
+
" delim : delemeter\n",
|
| 807 |
+
" \"\"\"\n",
|
| 808 |
+
" lines = list()\n",
|
| 809 |
+
" for li in fileRecGen(dirPath, delim):\n",
|
| 810 |
+
" if randint(0, 100) < percen:\n",
|
| 811 |
+
" lines.append(li)\n",
|
| 812 |
+
" return lines\n",
|
| 813 |
+
"\n",
|
| 814 |
+
"def getFileColumnAsString(dirPath, index, delim=\",\"):\n",
|
| 815 |
+
" \"\"\"\n",
|
| 816 |
+
" get string column from a file\n",
|
| 817 |
+
"\n",
|
| 818 |
+
" Parameters\n",
|
| 819 |
+
" dirPath : file path\n",
|
| 820 |
+
" index : index\n",
|
| 821 |
+
" delim : delemeter\n",
|
| 822 |
+
" \"\"\"\n",
|
| 823 |
+
" fields = list()\n",
|
| 824 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 825 |
+
" fields.append(rec[index])\n",
|
| 826 |
+
" #print(fields)\t\n",
|
| 827 |
+
" return fields\n",
|
| 828 |
+
"\n",
|
| 829 |
+
"def getFileColumnsAsString(dirPath, indexes, delim=\",\"):\n",
|
| 830 |
+
" \"\"\"\n",
|
| 831 |
+
" get multiple string columns from a file\n",
|
| 832 |
+
"\n",
|
| 833 |
+
" Parameters\n",
|
| 834 |
+
" dirPath : file path\n",
|
| 835 |
+
" indexes : indexes of columns\n",
|
| 836 |
+
" delim : delemeter\n",
|
| 837 |
+
" \"\"\"\n",
|
| 838 |
+
" nindex = len(indexes)\n",
|
| 839 |
+
" columns = list(map(lambda i : list(), range(nindex)))\n",
|
| 840 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 841 |
+
" for i in range(nindex):\n",
|
| 842 |
+
" columns[i].append(rec[indexes[i]])\n",
|
| 843 |
+
" return columns\n",
|
| 844 |
+
"\n",
|
| 845 |
+
"def getFileColumnAsFloat(dirPath, index, delim=\",\"):\n",
|
| 846 |
+
" \"\"\"\n",
|
| 847 |
+
" get float fileds from a file\n",
|
| 848 |
+
"\n",
|
| 849 |
+
" Parameters\n",
|
| 850 |
+
" dirPath : file path\n",
|
| 851 |
+
" index : index\n",
|
| 852 |
+
" delim : delemeter\n",
|
| 853 |
+
" \"\"\"\n",
|
| 854 |
+
" #print(\"{} {}\".format(dirPath, index))\n",
|
| 855 |
+
" fields = getFileColumnAsString(dirPath, index, delim)\n",
|
| 856 |
+
" return list(map(lambda v:float(v), fields))\n",
|
| 857 |
+
"\n",
|
| 858 |
+
"def getFileColumnAsInt(dirPath, index, delim=\",\"):\n",
|
| 859 |
+
" \"\"\"\n",
|
| 860 |
+
" get float fileds from a file\n",
|
| 861 |
+
"\n",
|
| 862 |
+
" Parameters\n",
|
| 863 |
+
" dirPath : file path\n",
|
| 864 |
+
" index : index\n",
|
| 865 |
+
" delim : delemeter\n",
|
| 866 |
+
" \"\"\"\n",
|
| 867 |
+
" fields = getFileColumnAsString(dirPath, index, delim)\n",
|
| 868 |
+
" return list(map(lambda v:int(v), fields))\n",
|
| 869 |
+
"\n",
|
| 870 |
+
"def getFileAsIntMatrix(dirPath, columns, delim=\",\"):\n",
|
| 871 |
+
" \"\"\"\n",
|
| 872 |
+
" extracts int matrix from csv file given column indices with each row being concatenation of \n",
|
| 873 |
+
" extracted column values row size = num of columns\n",
|
| 874 |
+
"\n",
|
| 875 |
+
" Parameters\n",
|
| 876 |
+
" dirPath : file path\n",
|
| 877 |
+
" columns : indexes of columns\n",
|
| 878 |
+
" delim : delemeter\n",
|
| 879 |
+
" \"\"\"\n",
|
| 880 |
+
" mat = list()\n",
|
| 881 |
+
" for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
|
| 882 |
+
" mat.append(asIntList(rec))\n",
|
| 883 |
+
" return mat\n",
|
| 884 |
+
"\n",
|
| 885 |
+
"def getFileAsFloatMatrix(dirPath, columns, delim=\",\"):\n",
|
| 886 |
+
" \"\"\"\n",
|
| 887 |
+
" extracts float matrix from csv file given column indices with each row being concatenation of \n",
|
| 888 |
+
" extracted column values row size = num of columns\n",
|
| 889 |
+
" Parameters\n",
|
| 890 |
+
" dirPath : file path\n",
|
| 891 |
+
" columns : indexes of columns\n",
|
| 892 |
+
" delim : delemeter\n",
|
| 893 |
+
" \"\"\"\n",
|
| 894 |
+
" mat = list()\n",
|
| 895 |
+
" for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
|
| 896 |
+
" mat.append(asFloatList(rec))\n",
|
| 897 |
+
" return mat\n",
|
| 898 |
+
"\n",
|
| 899 |
+
"def getFileAsFloatColumn(dirPath):\n",
|
| 900 |
+
" \"\"\"\n",
|
| 901 |
+
" grt float list from a file with one float per row\n",
|
| 902 |
+
" Parameters\n",
|
| 903 |
+
" dirPath : file path\n",
|
| 904 |
+
" \"\"\"\n",
|
| 905 |
+
" flist = list()\n",
|
| 906 |
+
" for rec in fileRecGen(dirPath, None):\n",
|
| 907 |
+
" flist.append(float(rec))\n",
|
| 908 |
+
" return flist\n",
|
| 909 |
+
"\n",
|
| 910 |
+
"def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=\",\"):\n",
|
| 911 |
+
" \"\"\"\n",
|
| 912 |
+
" extracts float matrix from csv file given row filter and column indices with each row being \n",
|
| 913 |
+
" concatenation of extracted column values row size = num of columns\n",
|
| 914 |
+
" Parameters\n",
|
| 915 |
+
" dirPath : file path\n",
|
| 916 |
+
" columns : indexes of columns\n",
|
| 917 |
+
" filt : row filter lambda\n",
|
| 918 |
+
" delim : delemeter\n",
|
| 919 |
+
" \"\"\"\n",
|
| 920 |
+
" mat = list()\n",
|
| 921 |
+
" for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):\n",
|
| 922 |
+
" mat.append(asFloatList(rec))\n",
|
| 923 |
+
" return mat\n",
|
| 924 |
+
"\n",
|
| 925 |
+
"def getFileAsTypedRecords(dirPath, types, delim=\",\"):\n",
|
| 926 |
+
" \"\"\"\n",
|
| 927 |
+
" extracts typed records from csv file with each row being concatenation of \n",
|
| 928 |
+
" extracted column values \n",
|
| 929 |
+
" Parameters\n",
|
| 930 |
+
" dirPath : file path\n",
|
| 931 |
+
" types : data types\n",
|
| 932 |
+
" delim : delemeter\n",
|
| 933 |
+
" \"\"\"\n",
|
| 934 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\n",
|
| 935 |
+
" tdata = list()\n",
|
| 936 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 937 |
+
" trec = list()\n",
|
| 938 |
+
" for index, value in enumerate(rec):\n",
|
| 939 |
+
" value = __convToTyped(index, value, dtypes)\n",
|
| 940 |
+
" trec.append(value)\n",
|
| 941 |
+
" tdata.append(trec)\n",
|
| 942 |
+
" return tdata\n",
|
| 943 |
+
"\n",
|
| 944 |
+
"\n",
|
| 945 |
+
"def getFileColsAsTypedRecords(dirPath, columns, types, delim=\",\"):\n",
|
| 946 |
+
" \"\"\"\n",
|
| 947 |
+
" extracts typed records from csv file given column indices with each row being concatenation of \n",
|
| 948 |
+
" extracted column values \n",
|
| 949 |
+
" Parameters\n",
" dirPath : file path\n",
|
| 952 |
+
" columns : column indexes\n",
|
| 953 |
+
" types : data types\n",
|
| 954 |
+
" delim : delemeter\n",
|
| 955 |
+
" \"\"\"\n",
|
| 956 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\n",
|
| 957 |
+
" tdata = list()\n",
|
| 958 |
+
" for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
|
| 959 |
+
" trec = list()\n",
|
| 960 |
+
" for indx, value in enumerate(rec):\n",
|
| 961 |
+
" tindx = columns[indx]\n",
|
| 962 |
+
" value = __convToTyped(tindx, value, dtypes)\n",
|
| 963 |
+
" trec.append(value)\n",
|
| 964 |
+
" tdata.append(trec)\n",
|
| 965 |
+
" return tdata\n",
|
| 966 |
+
"\n",
|
| 967 |
+
"def getFileColumnsMinMax(dirPath, columns, dtype, delim=\",\"):\n",
|
| 968 |
+
" \"\"\"\n",
|
| 969 |
+
" extracts numeric matrix from csv file given column indices. For each column return min and max\n",
|
| 970 |
+
" Parameters\n",
|
| 971 |
+
" dirPath : file path\n",
|
| 972 |
+
" columns : column indexes\n",
|
| 973 |
+
" dtype : data type\n",
|
| 974 |
+
" delim : delemeter\n",
|
| 975 |
+
" \"\"\"\n",
|
| 976 |
+
" dtypes = list(map(lambda c : str(c) + \":\" + dtype, columns))\n",
|
| 977 |
+
" dtypes = \",\".join(dtypes)\n",
|
| 978 |
+
" #print(dtypes)\n",
|
| 979 |
+
"\n",
|
| 980 |
+
" tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)\n",
|
| 981 |
+
" minMax = list()\n",
|
| 982 |
+
" ncola = len(tdata[0])\n",
|
| 983 |
+
" ncole = len(columns)\n",
|
| 984 |
+
" assertEqual(ncola, ncole, \"actual no of columns different from expected\")\n",
|
| 985 |
+
"\n",
|
| 986 |
+
" for ci in range(ncole):\t\n",
|
| 987 |
+
" vmin = sys.float_info.max\n",
|
| 988 |
+
" vmax = sys.float_info.min\n",
|
| 989 |
+
" for r in tdata:\n",
|
| 990 |
+
" cv = r[ci]\n",
|
| 991 |
+
" vmin = cv if cv < vmin else vmin\n",
|
| 992 |
+
" vmax = cv if cv > vmax else vmax\n",
|
| 993 |
+
" mm = (vmin, vmax, vmax - vmin)\n",
|
| 994 |
+
" minMax.append(mm)\n",
|
| 995 |
+
"\n",
|
| 996 |
+
" return minMax\n",
|
| 997 |
+
"\n",
|
| 998 |
+
"\n",
|
| 999 |
+
"def getRecAsTypedRecord(rec, types, delim=None):\n",
|
| 1000 |
+
" \"\"\"\n",
|
| 1001 |
+
" converts record to typed records \n",
|
| 1002 |
+
" Parameters\n",
|
| 1003 |
+
" rec : delemeter separate string or list of string\n",
|
| 1004 |
+
" types : field data types\n",
|
| 1005 |
+
" delim : delemeter\n",
|
| 1006 |
+
" \"\"\"\t\n",
|
| 1007 |
+
" if delim is not None:\n",
|
| 1008 |
+
" rec = rec.split(delim)\n",
|
| 1009 |
+
" (dtypes, cvalues) = extractTypesFromString(types)\n",
|
| 1010 |
+
" #print(types)\n",
|
| 1011 |
+
" #print(dtypes)\n",
|
| 1012 |
+
" trec = list()\n",
|
| 1013 |
+
" for ind, value in enumerate(rec):\n",
|
| 1014 |
+
" tvalue = __convToTyped(ind, value, dtypes)\n",
|
| 1015 |
+
" trec.append(tvalue)\n",
|
| 1016 |
+
" return trec\n",
|
| 1017 |
+
"\n",
|
| 1018 |
+
"def __convToTyped(index, value, dtypes):\n",
|
| 1019 |
+
" \"\"\"\n",
|
| 1020 |
+
" convert to typed value \n",
|
| 1021 |
+
" Parameters\n",
|
| 1022 |
+
" index : index in type list\n",
|
| 1023 |
+
" value : data value\n",
|
| 1024 |
+
" dtypes : data type list\n",
|
| 1025 |
+
" \"\"\"\n",
|
| 1026 |
+
" #print(index, value)\n",
|
| 1027 |
+
" dtype = dtypes[index]\n",
|
| 1028 |
+
" tvalue = value\n",
|
| 1029 |
+
" if dtype == \"int\":\n",
|
| 1030 |
+
" tvalue = int(value)\n",
|
| 1031 |
+
" elif dtype == \"float\":\n",
|
| 1032 |
+
" tvalue = float(value)\n",
|
| 1033 |
+
" return tvalue\n",
|
| 1034 |
+
"\n",
|
| 1035 |
+
"\n",
|
| 1036 |
+
"\n",
|
| 1037 |
+
"def extractTypesFromString(types):\n",
|
| 1038 |
+
" \"\"\"\n",
|
| 1039 |
+
" extracts column data types and set values for categorical variables \n",
|
| 1040 |
+
" Parameters\n",
|
| 1041 |
+
" types : encoded type information\n",
|
| 1042 |
+
" \"\"\"\n",
|
| 1043 |
+
" ftypes = types.split(\",\")\n",
|
| 1044 |
+
" dtypes = dict()\n",
|
| 1045 |
+
" cvalues = dict()\n",
|
| 1046 |
+
" for ftype in ftypes:\n",
|
| 1047 |
+
" items = ftype.split(\":\") \n",
|
| 1048 |
+
" cindex = int(items[0])\n",
|
| 1049 |
+
" dtype = items[1]\n",
|
| 1050 |
+
" dtypes[cindex] = dtype\n",
|
| 1051 |
+
" if len(items) == 3:\n",
|
| 1052 |
+
" sitems = items[2].split()\n",
|
| 1053 |
+
" cvalues[cindex] = sitems\n",
|
| 1054 |
+
" return (dtypes, cvalues)\n",
|
| 1055 |
+
"\n",
|
| 1056 |
+
"def getMultipleFileAsInttMatrix(dirPathWithCol, delim=\",\"):\n",
|
| 1057 |
+
" \"\"\"\n",
|
| 1058 |
+
" extracts int matrix from from csv files given column index for each file. \n",
|
| 1059 |
+
" num of columns = number of rows in each file and num of rows = number of files\n",
|
| 1060 |
+
" Parameters\n",
|
| 1061 |
+
" dirPathWithCol: list of file path and collumn index pair\n",
|
| 1062 |
+
" delim : delemeter\n",
|
| 1063 |
+
" \"\"\"\n",
|
| 1064 |
+
" mat = list()\n",
|
| 1065 |
+
" minLen = -1\n",
|
| 1066 |
+
" for path, col in dirPathWithCol:\n",
|
| 1067 |
+
" colVals = getFileColumnAsInt(path, col, delim)\n",
|
| 1068 |
+
" if minLen < 0 or len(colVals) < minLen:\n",
|
| 1069 |
+
" minLen = len(colVals)\n",
|
| 1070 |
+
" mat.append(colVals)\n",
|
| 1071 |
+
"\n",
|
| 1072 |
+
" #make all same length\n",
|
| 1073 |
+
" mat = list(map(lambda li:li[:minLen], mat))\n",
|
| 1074 |
+
" return mat\n",
|
| 1075 |
+
"\n",
|
| 1076 |
+
"def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=\",\"):\n",
|
| 1077 |
+
" \"\"\"\n",
|
| 1078 |
+
" extracts float matrix from from csv files given column index for each file. \n",
|
| 1079 |
+
" num of columns = number of rows in each file and num of rows = number of files\n",
|
| 1080 |
+
" Parameters\n",
|
| 1081 |
+
" dirPathWithCol: list of file path and collumn index pair\n",
|
| 1082 |
+
" delim : delemeter\n",
|
| 1083 |
+
" \"\"\"\n",
|
| 1084 |
+
" mat = list()\n",
|
| 1085 |
+
" minLen = -1\n",
|
| 1086 |
+
" for path, col in dirPathWithCol:\n",
|
| 1087 |
+
" colVals = getFileColumnAsFloat(path, col, delim)\n",
|
| 1088 |
+
" if minLen < 0 or len(colVals) < minLen:\n",
|
| 1089 |
+
" minLen = len(colVals)\n",
|
| 1090 |
+
" mat.append(colVals)\n",
|
| 1091 |
+
"\n",
|
| 1092 |
+
" #make all same length\n",
|
| 1093 |
+
" mat = list(map(lambda li:li[:minLen], mat))\n",
|
| 1094 |
+
" return mat\n",
|
| 1095 |
+
"\n",
|
| 1096 |
+
"def writeStrListToFile(ldata, filePath, delem=\",\"):\n",
|
| 1097 |
+
" \"\"\"\n",
|
| 1098 |
+
" writes list of dlem separated string or list of list of string to afile\n",
|
| 1099 |
+
"\n",
|
| 1100 |
+
" Parameters\n",
|
| 1101 |
+
" ldata : list data\n",
|
| 1102 |
+
" filePath : file path\n",
|
| 1103 |
+
" delim : delemeter\n",
|
| 1104 |
+
" \"\"\"\n",
|
| 1105 |
+
" with open(filePath, \"w\") as fh:\n",
|
| 1106 |
+
" for r in ldata:\n",
|
| 1107 |
+
" if type(r) == list:\n",
|
| 1108 |
+
" r = delem.join(r)\n",
|
| 1109 |
+
" fh.write(r + \"\\n\")\n",
|
| 1110 |
+
"\n",
|
| 1111 |
+
"def writeFloatListToFile(ldata, prec, filePath):\n",
|
| 1112 |
+
" \"\"\"\n",
|
| 1113 |
+
" writes float list to file, one value per line\n",
|
| 1114 |
+
"\n",
|
| 1115 |
+
" Parameters\n",
|
| 1116 |
+
" ldata : list data\n",
|
| 1117 |
+
" prec : precision\n",
|
| 1118 |
+
" filePath : file path\n",
|
| 1119 |
+
" \"\"\"\n",
|
| 1120 |
+
" with open(filePath, \"w\") as fh:\n",
|
| 1121 |
+
" for d in ldata:\n",
|
| 1122 |
+
" fh.write(formatFloat(prec, d) + \"\\n\")\n",
|
| 1123 |
+
"\n",
|
| 1124 |
+
"\n",
|
| 1125 |
+
"def takeFirst(elems):\n",
|
| 1126 |
+
" \"\"\"\n",
|
| 1127 |
+
" return fisrt item\n",
|
| 1128 |
+
" Parameters\n",
|
| 1129 |
+
" elems : list of data \n",
|
| 1130 |
+
" \"\"\"\n",
|
| 1131 |
+
" return elems[0]\n",
|
| 1132 |
+
"\n",
|
| 1133 |
+
"def takeSecond(elems):\n",
|
| 1134 |
+
" \"\"\"\n",
|
| 1135 |
+
" return 2nd element\n",
|
| 1136 |
+
" Parameters\n",
|
| 1137 |
+
" elems : list of data \n",
|
| 1138 |
+
" \"\"\"\n",
|
| 1139 |
+
" return elems[1]\n",
|
| 1140 |
+
"\n",
|
| 1141 |
+
"def takeThird(elems):\n",
|
| 1142 |
+
" \"\"\"\n",
|
| 1143 |
+
" returns 3rd element\n",
|
| 1144 |
+
" Parameters\n",
|
| 1145 |
+
" elems : list of data \n",
|
| 1146 |
+
" \"\"\"\n",
|
| 1147 |
+
" return elems[2]\n",
|
| 1148 |
+
"\n",
|
| 1149 |
+
"def addToKeyedCounter(dCounter, key, count=1):\n",
|
| 1150 |
+
" \"\"\"\n",
|
| 1151 |
+
" add to to keyed counter\n",
|
| 1152 |
+
" Parameters\n",
|
| 1153 |
+
" dCounter : dictionary of counters\n",
|
| 1154 |
+
" key : dictionary key\n",
|
| 1155 |
+
" count : count to add\n",
|
| 1156 |
+
" \"\"\"\n",
|
| 1157 |
+
" curCount = dCounter.get(key, 0)\n",
|
| 1158 |
+
" dCounter[key] = curCount + count\n",
|
| 1159 |
+
"\n",
|
| 1160 |
+
"def incrKeyedCounter(dCounter, key):\n",
|
| 1161 |
+
" \"\"\"\n",
|
| 1162 |
+
" increment keyed counter\n",
|
| 1163 |
+
" Parameters\n",
|
| 1164 |
+
" dCounter : dictionary of counters\n",
|
| 1165 |
+
" key : dictionary key\n",
|
| 1166 |
+
" \"\"\"\n",
|
| 1167 |
+
" addToKeyedCounter(dCounter, key, 1)\n",
|
| 1168 |
+
"\n",
|
| 1169 |
+
"def appendKeyedList(dList, key, elem):\n",
|
| 1170 |
+
" \"\"\"\n",
|
| 1171 |
+
" keyed list\n",
|
| 1172 |
+
" Parameters\n",
|
| 1173 |
+
" dList : dictionary of lists\n",
|
| 1174 |
+
" key : dictionary key\n",
|
| 1175 |
+
" elem : value to append\n",
|
| 1176 |
+
" \"\"\"\n",
|
| 1177 |
+
" curList = dList.get(key, [])\n",
|
| 1178 |
+
" curList.append(elem)\n",
|
| 1179 |
+
" dList[key] = curList\n",
|
| 1180 |
+
"\n",
|
| 1181 |
+
"def isNumber(st):\n",
|
| 1182 |
+
" \"\"\"\n",
|
| 1183 |
+
" Returns True is string is a number\n",
|
| 1184 |
+
" Parameters\n",
|
| 1185 |
+
" st : string value\n",
|
| 1186 |
+
" \"\"\"\n",
|
| 1187 |
+
" return st.replace('.','',1).isdigit()\n",
|
| 1188 |
+
"\n",
|
| 1189 |
+
"def removeNan(values):\n",
|
| 1190 |
+
" \"\"\"\n",
|
| 1191 |
+
" removes nan from list\n",
|
| 1192 |
+
" Parameters\n",
|
| 1193 |
+
" values : list data\n",
|
| 1194 |
+
" \"\"\"\n",
|
| 1195 |
+
" return list(filter(lambda v: not math.isnan(v), values))\n",
|
| 1196 |
+
"\n",
|
| 1197 |
+
"def fileRecGen(filePath, delim = \",\"):\n",
|
| 1198 |
+
" \"\"\"\n",
|
| 1199 |
+
" file record generator\n",
|
| 1200 |
+
" Parameters\n",
|
| 1201 |
+
" filePath ; file path\n",
|
| 1202 |
+
" delim : delemeter\n",
|
| 1203 |
+
" \"\"\"\n",
|
| 1204 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1205 |
+
" for line in fp:\t\n",
|
| 1206 |
+
" line = line[:-1]\n",
|
| 1207 |
+
" if delim is not None:\n",
|
| 1208 |
+
" line = line.split(delim)\n",
|
| 1209 |
+
" yield line\n",
|
| 1210 |
+
"\n",
|
| 1211 |
+
"def fileSelFieldsRecGen(dirPath, columns, delim=\",\"):\n",
|
| 1212 |
+
" \"\"\"\n",
|
| 1213 |
+
" file record generator given column indices \n",
|
| 1214 |
+
" Parameters\n",
|
| 1215 |
+
" filePath ; file path\n",
|
| 1216 |
+
" columns : column indexes as int array or coma separated string\n",
|
| 1217 |
+
" delim : delemeter\n",
|
| 1218 |
+
" \"\"\"\n",
|
| 1219 |
+
" if type(columns) == str:\n",
|
| 1220 |
+
" columns = strToIntArray(columns, delim)\n",
|
| 1221 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 1222 |
+
" extracted = extractList(rec, columns)\n",
|
| 1223 |
+
" yield extracted\n",
|
| 1224 |
+
"\n",
|
| 1225 |
+
"def fileFiltRecGen(filePath, filt, delim = \",\"):\n",
|
| 1226 |
+
" \"\"\"\n",
|
| 1227 |
+
" file record generator with row filter applied\n",
|
| 1228 |
+
" Parameters\n",
|
| 1229 |
+
" filePath ; file path\n",
|
| 1230 |
+
" filt : row filter\n",
|
| 1231 |
+
" delim : delemeter\n",
|
| 1232 |
+
" \"\"\"\n",
|
| 1233 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1234 |
+
" for line in fp:\t\n",
|
| 1235 |
+
" line = line[:-1]\n",
|
| 1236 |
+
" if delim is not None:\n",
|
| 1237 |
+
" line = line.split(delim)\n",
|
| 1238 |
+
" if filt(line):\n",
|
| 1239 |
+
" yield line\n",
|
| 1240 |
+
"\n",
|
| 1241 |
+
"def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = \",\"):\n",
|
| 1242 |
+
" \"\"\"\n",
|
| 1243 |
+
" file record generator with row and column filter applied\n",
|
| 1244 |
+
" Parameters\n",
|
| 1245 |
+
" filePath ; file path\n",
|
| 1246 |
+
" filt : row filter\n",
|
| 1247 |
+
" columns : column indexes as int array or coma separated string\n",
|
| 1248 |
+
" delim : delemeter\n",
|
| 1249 |
+
" \"\"\"\n",
|
| 1250 |
+
" columns = strToIntArray(columns, delim)\n",
|
| 1251 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1252 |
+
" for line in fp:\t\n",
|
| 1253 |
+
" line = line[:-1]\n",
|
| 1254 |
+
" if delim is not None:\n",
|
| 1255 |
+
" line = line.split(delim)\n",
|
| 1256 |
+
" if filt(line):\n",
|
| 1257 |
+
" selected = extractList(line, columns)\n",
|
| 1258 |
+
" yield selected\n",
|
| 1259 |
+
"\n",
|
| 1260 |
+
"def fileTypedRecGen(filePath, ftypes, delim = \",\"):\n",
|
| 1261 |
+
" \"\"\"\n",
|
| 1262 |
+
" file typed record generator\n",
|
| 1263 |
+
" Parameters\n",
|
| 1264 |
+
" filePath ; file path\n",
|
| 1265 |
+
" ftypes : list of field types\n",
|
| 1266 |
+
" delim : delemeter\n",
|
| 1267 |
+
" \"\"\"\n",
|
| 1268 |
+
" with open(filePath, \"r\") as fp:\n",
|
| 1269 |
+
" for line in fp:\t\n",
|
| 1270 |
+
" line = line[:-1]\n",
|
| 1271 |
+
" line = line.split(delim)\n",
|
| 1272 |
+
" for i in range(0, len(ftypes), 2):\n",
|
| 1273 |
+
" ci = ftypes[i]\n",
|
| 1274 |
+
" dtype = ftypes[i+1]\n",
|
| 1275 |
+
" assertLesser(ci, len(line), \"index out of bound\")\n",
|
| 1276 |
+
" if dtype == \"int\":\n",
|
| 1277 |
+
" line[ci] = int(line[ci])\n",
|
| 1278 |
+
" elif dtype == \"float\":\n",
|
| 1279 |
+
" line[ci] = float(line[ci])\n",
|
| 1280 |
+
" else:\n",
|
| 1281 |
+
" exitWithMsg(\"invalid data type\")\n",
|
| 1282 |
+
" yield line\n",
|
| 1283 |
+
"\n",
|
| 1284 |
+
"def fileMutatedFieldsRecGen(dirPath, mutator, delim=\",\"):\n",
|
| 1285 |
+
" \"\"\"\n",
|
| 1286 |
+
" file record generator with some columns mutated \n",
|
| 1287 |
+
" Parameters\n",
|
| 1288 |
+
" dirPath ; file path\n",
|
| 1289 |
+
" mutator : row field mutator\n",
|
| 1290 |
+
" delim : delemeter\n",
|
| 1291 |
+
" \"\"\"\n",
|
| 1292 |
+
" for rec in fileRecGen(dirPath, delim):\n",
|
| 1293 |
+
" mutated = mutator(rec)\n",
|
| 1294 |
+
" yield mutated\n",
|
| 1295 |
+
"\n",
|
| 1296 |
+
"def tableSelFieldsFilter(tdata, columns):\n",
|
| 1297 |
+
" \"\"\"\n",
|
| 1298 |
+
" gets tabular data for selected columns \n",
|
| 1299 |
+
" Parameters\n",
|
| 1300 |
+
" tdata : tabular data\n",
|
| 1301 |
+
" columns : column indexes\n",
|
| 1302 |
+
" \"\"\"\n",
|
| 1303 |
+
" if areAllFieldsIncluded(tdata[0], columns):\n",
|
| 1304 |
+
" ntdata = tdata\n",
|
| 1305 |
+
" else:\n",
|
| 1306 |
+
" ntdata = list()\n",
|
| 1307 |
+
" for rec in tdata:\n",
|
| 1308 |
+
" #print(rec)\n",
|
| 1309 |
+
" #print(columns)\n",
|
| 1310 |
+
" nrec = extractList(rec, columns)\n",
|
| 1311 |
+
" ntdata.append(nrec)\n",
|
| 1312 |
+
" return ntdata\n",
|
| 1313 |
+
"\n",
|
| 1314 |
+
"\n",
|
| 1315 |
+
"def areAllFieldsIncluded(ldata, columns):\n",
|
| 1316 |
+
" \"\"\"\n",
|
| 1317 |
+
" return True id all indexes are in the columns\n",
|
| 1318 |
+
" Parameters\n",
|
| 1319 |
+
" ldata : list data\n",
|
| 1320 |
+
" columns : column indexes\n",
|
| 1321 |
+
" \"\"\"\n",
|
| 1322 |
+
" return list(range(len(ldata))) == columns\n",
|
| 1323 |
+
"\n",
|
| 1324 |
+
"def asIntList(items):\n",
|
| 1325 |
+
" \"\"\"\n",
|
| 1326 |
+
" returns int list\n",
|
| 1327 |
+
" Parameters\n",
|
| 1328 |
+
" items : list data\n",
|
| 1329 |
+
" \"\"\"\n",
|
| 1330 |
+
" return [int(i) for i in items]\n",
|
| 1331 |
+
"\n",
|
| 1332 |
+
"def asFloatList(items):\n",
|
| 1333 |
+
" \"\"\"\n",
|
| 1334 |
+
" returns float list\n",
|
| 1335 |
+
" Parameters\n",
|
| 1336 |
+
" items : list data\n",
|
| 1337 |
+
" \"\"\"\n",
|
| 1338 |
+
" return [float(i) for i in items]\n",
|
| 1339 |
+
"\n",
|
| 1340 |
+
"def pastTime(interval, unit):\n",
|
| 1341 |
+
" \"\"\"\n",
|
| 1342 |
+
" current and past time\n",
|
| 1343 |
+
" Parameters\n",
|
| 1344 |
+
" interval : time interval\n",
|
| 1345 |
+
" unit: time unit\n",
|
| 1346 |
+
" \"\"\"\n",
|
| 1347 |
+
" curTime = int(time.time())\n",
|
| 1348 |
+
" if unit == \"d\":\n",
|
| 1349 |
+
" pastTime = curTime - interval * secInDay\n",
|
| 1350 |
+
" elif unit == \"h\":\n",
|
| 1351 |
+
" pastTime = curTime - interval * secInHour\n",
|
| 1352 |
+
" elif unit == \"m\":\n",
|
| 1353 |
+
" pastTime = curTime - interval * secInMinute\n",
|
| 1354 |
+
" else:\n",
|
| 1355 |
+
" raise ValueError(\"invalid time unit \" + unit)\n",
|
| 1356 |
+
" return (curTime, pastTime)\n",
|
| 1357 |
+
"\n",
|
| 1358 |
+
"def minuteAlign(ts):\n",
|
| 1359 |
+
" \"\"\"\n",
|
| 1360 |
+
" minute aligned time\t\n",
|
| 1361 |
+
" Parameters\n",
|
| 1362 |
+
" ts : time stamp in sec\n",
|
| 1363 |
+
" \"\"\"\n",
|
| 1364 |
+
" return int((ts / secInMinute)) * secInMinute\n",
|
| 1365 |
+
"\n",
|
| 1366 |
+
"def multMinuteAlign(ts, min):\n",
|
| 1367 |
+
" \"\"\"\n",
|
| 1368 |
+
" multi minute aligned time\t\n",
|
| 1369 |
+
" Parameters\n",
|
| 1370 |
+
" ts : time stamp in sec\n",
|
| 1371 |
+
" min : minute value\n",
|
| 1372 |
+
" \"\"\"\n",
|
| 1373 |
+
" intv = secInMinute * min\n",
|
| 1374 |
+
" return int((ts / intv)) * intv\n",
|
| 1375 |
+
"\n",
|
| 1376 |
+
"def hourAlign(ts):\n",
|
| 1377 |
+
" \"\"\"\n",
|
| 1378 |
+
" hour aligned time\n",
|
| 1379 |
+
" Parameters\n",
|
| 1380 |
+
" ts : time stamp in sec\n",
|
| 1381 |
+
" \"\"\"\n",
|
| 1382 |
+
" return int((ts / secInHour)) * secInHour\n",
|
| 1383 |
+
"\n",
|
| 1384 |
+
"def hourOfDayAlign(ts, hour):\n",
|
| 1385 |
+
" \"\"\"\n",
|
| 1386 |
+
" hour of day aligned time\n",
|
| 1387 |
+
" Parameters\n",
|
| 1388 |
+
" ts : time stamp in sec\n",
|
| 1389 |
+
" hour : hour of day\n",
|
| 1390 |
+
" \"\"\"\n",
|
| 1391 |
+
" day = int(ts / secInDay)\n",
|
| 1392 |
+
" return (24 * day + hour) * secInHour\n",
|
| 1393 |
+
"\n",
|
| 1394 |
+
"def dayAlign(ts):\n",
|
| 1395 |
+
" \"\"\"\n",
|
| 1396 |
+
" day aligned time\n",
|
| 1397 |
+
" Parameters\n",
|
| 1398 |
+
" ts : time stamp in sec\n",
|
| 1399 |
+
" \"\"\"\n",
|
| 1400 |
+
" return int(ts / secInDay) * secInDay\n",
|
| 1401 |
+
"\n",
|
| 1402 |
+
"def timeAlign(ts, unit):\n",
|
| 1403 |
+
" \"\"\"\n",
|
| 1404 |
+
" boundary alignment of time\n",
|
| 1405 |
+
" Parameters\n",
|
| 1406 |
+
" ts : time stamp in sec\n",
|
| 1407 |
+
" unit : unit of time\n",
|
| 1408 |
+
" \"\"\"\n",
|
| 1409 |
+
" alignedTs = 0\n",
|
| 1410 |
+
" if unit == \"s\":\n",
|
| 1411 |
+
" alignedTs = ts\n",
|
| 1412 |
+
" elif unit == \"m\":\n",
|
| 1413 |
+
" alignedTs = minuteAlign(ts)\n",
|
| 1414 |
+
" elif unit == \"h\":\n",
|
| 1415 |
+
" alignedTs = hourAlign(ts)\n",
|
| 1416 |
+
" elif unit == \"d\":\n",
|
| 1417 |
+
" alignedTs = dayAlign(ts)\n",
|
| 1418 |
+
" else:\n",
|
| 1419 |
+
" raise ValueError(\"invalid time unit\")\n",
|
| 1420 |
+
" return alignedTs\n",
|
| 1421 |
+
"\n",
|
| 1422 |
+
"def monthOfYear(ts):\n",
|
| 1423 |
+
" \"\"\"\n",
|
| 1424 |
+
" month of year\n",
|
| 1425 |
+
" Parameters\n",
|
| 1426 |
+
" ts : time stamp in sec\n",
|
| 1427 |
+
" \"\"\"\n",
|
| 1428 |
+
" rem = ts % secInYear\n",
|
| 1429 |
+
" dow = int(rem / secInMonth)\n",
|
| 1430 |
+
" return dow\n",
|
| 1431 |
+
"\n",
|
| 1432 |
+
"def dayOfWeek(ts):\n",
|
| 1433 |
+
" \"\"\"\n",
|
| 1434 |
+
" day of week\n",
|
| 1435 |
+
" Parameters\n",
|
| 1436 |
+
" ts : time stamp in sec\n",
|
| 1437 |
+
" \"\"\"\n",
|
| 1438 |
+
" rem = ts % secInWeek\n",
|
| 1439 |
+
" dow = int(rem / secInDay)\n",
|
| 1440 |
+
" return dow\n",
|
| 1441 |
+
"\n",
|
| 1442 |
+
"def hourOfDay(ts):\n",
|
| 1443 |
+
" \"\"\"\n",
|
| 1444 |
+
" hour of day\n",
|
| 1445 |
+
" Parameters\n",
|
| 1446 |
+
" ts : time stamp in sec\n",
|
| 1447 |
+
" \"\"\"\n",
|
| 1448 |
+
" rem = ts % secInDay\n",
|
| 1449 |
+
" hod = int(rem / secInHour)\n",
|
| 1450 |
+
" return hod\n",
|
| 1451 |
+
"\n",
|
| 1452 |
+
"def processCmdLineArgs(expectedTypes, usage):\n",
|
| 1453 |
+
" \"\"\"\n",
|
| 1454 |
+
" process command line args and returns args as typed values\n",
|
| 1455 |
+
" Parameters\n",
|
| 1456 |
+
" expectedTypes : expected data types of arguments\n",
|
| 1457 |
+
" usage : usage message string\n",
|
| 1458 |
+
" \"\"\"\n",
|
| 1459 |
+
" args = []\n",
|
| 1460 |
+
" numComLineArgs = len(sys.argv)\n",
|
| 1461 |
+
" numExpected = len(expectedTypes)\n",
|
| 1462 |
+
" if (numComLineArgs - 1 == len(expectedTypes)):\n",
|
| 1463 |
+
" try:\n",
|
| 1464 |
+
" for i in range(0, numExpected):\n",
|
| 1465 |
+
" if (expectedTypes[i] == typeInt):\n",
|
| 1466 |
+
" args.append(int(sys.argv[i+1]))\n",
|
| 1467 |
+
" elif (expectedTypes[i] == typeFloat):\n",
|
| 1468 |
+
" args.append(float(sys.argv[i+1]))\n",
|
| 1469 |
+
" elif (expectedTypes[i] == typeString):\n",
|
| 1470 |
+
" args.append(sys.argv[i+1])\n",
|
| 1471 |
+
" except ValueError:\n",
|
| 1472 |
+
" print (\"expected number of command line arguments found but there is type mis match\")\n",
|
| 1473 |
+
" sys.exit(1)\n",
|
| 1474 |
+
" else:\n",
|
| 1475 |
+
" print (\"expected number of command line arguments not found\")\n",
|
| 1476 |
+
" print (usage)\n",
|
| 1477 |
+
" sys.exit(1)\n",
|
| 1478 |
+
" return args\n",
|
| 1479 |
+
"\n",
|
| 1480 |
+
"def mutateString(val, numMutate, ctype):\n",
|
| 1481 |
+
" \"\"\"\n",
|
| 1482 |
+
" mutate string multiple times\n",
|
| 1483 |
+
" Parameters\n",
|
| 1484 |
+
" val : string value\n",
|
| 1485 |
+
" numMutate : num of mutations\n",
|
| 1486 |
+
" ctype : type of character to mutate with\n",
|
| 1487 |
+
" \"\"\"\n",
|
| 1488 |
+
" mutations = set()\n",
|
| 1489 |
+
" count = 0\n",
|
| 1490 |
+
" while count < numMutate:\n",
|
| 1491 |
+
" j = randint(0, len(val)-1)\n",
|
| 1492 |
+
" if j not in mutations:\n",
|
| 1493 |
+
" if ctype == \"alpha\":\n",
|
| 1494 |
+
" ch = selectRandomFromList(alphaTokens)\n",
|
| 1495 |
+
" elif ctype == \"num\":\n",
|
| 1496 |
+
" ch = selectRandomFromList(numTokens)\n",
|
| 1497 |
+
" elif ctype == \"any\":\n",
|
| 1498 |
+
" ch = selectRandomFromList(tokens)\n",
|
| 1499 |
+
" val = val[:j] + ch + val[j+1:]\n",
|
| 1500 |
+
" mutations.add(j)\n",
|
| 1501 |
+
" count += 1\n",
|
| 1502 |
+
" return val\n",
|
| 1503 |
+
"\n",
|
| 1504 |
+
"def mutateList(values, numMutate, vmin, vmax):\n",
|
| 1505 |
+
" \"\"\"\n",
|
| 1506 |
+
" mutate list multiple times\n",
|
| 1507 |
+
" Parameters\n",
|
| 1508 |
+
" values : list value\n",
|
| 1509 |
+
" numMutate : num of mutations\n",
|
| 1510 |
+
" vmin : minimum of value range\n",
|
| 1511 |
+
" vmax : maximum of value range\n",
|
| 1512 |
+
" \"\"\"\n",
|
| 1513 |
+
" mutations = set()\n",
|
| 1514 |
+
" count = 0\n",
|
| 1515 |
+
" while count < numMutate:\n",
|
| 1516 |
+
" j = randint(0, len(values)-1)\n",
|
| 1517 |
+
" if j not in mutations:\n",
|
| 1518 |
+
" values[j] = np.random.uniform(vmin, vmax)\n",
|
| 1519 |
+
" count += 1\n",
|
| 1520 |
+
" return values\n",
|
| 1521 |
+
"\n",
|
| 1522 |
+
"\n",
|
| 1523 |
+
"def swap(values, first, second):\n",
|
| 1524 |
+
" \"\"\"\n",
|
| 1525 |
+
" swap two elements\n",
|
| 1526 |
+
" Parameters\n",
|
| 1527 |
+
" values : list value\n",
|
| 1528 |
+
" first : first swap position\n",
|
| 1529 |
+
" second : second swap position\n",
|
| 1530 |
+
" \"\"\"\n",
|
| 1531 |
+
" t = values[first]\n",
|
| 1532 |
+
" values[first] = values[second]\n",
|
| 1533 |
+
" values[second] = t\n",
|
| 1534 |
+
"\n",
|
| 1535 |
+
"def swapBetweenLists(values1, values2):\n",
|
| 1536 |
+
" \"\"\"\n",
|
| 1537 |
+
" swap two elements between 2 lists\n",
|
| 1538 |
+
" Parameters\n",
|
| 1539 |
+
" values1 : first list of values\n",
|
| 1540 |
+
" values2 : second list of values\n",
|
| 1541 |
+
" \"\"\"\n",
|
| 1542 |
+
" p1 = randint(0, len(values1)-1)\n",
|
| 1543 |
+
" p2 = randint(0, len(values2)-1)\n",
|
| 1544 |
+
" tmp = values1[p1]\n",
|
| 1545 |
+
" values1[p1] = values2[p2]\n",
|
| 1546 |
+
" values2[p2] = tmp\n",
|
| 1547 |
+
"\n",
|
| 1548 |
+
"def safeAppend(values, value):\n",
|
| 1549 |
+
" \"\"\"\n",
|
| 1550 |
+
" append only if not None\n",
|
| 1551 |
+
" Parameters\n",
|
| 1552 |
+
" values : list value\n",
|
| 1553 |
+
" value : value to append\n",
|
| 1554 |
+
" \"\"\"\n",
|
| 1555 |
+
" if value is not None:\n",
|
| 1556 |
+
" values.append(value)\n",
|
| 1557 |
+
"\n",
|
| 1558 |
+
"def getAllIndex(ldata, fldata):\n",
|
| 1559 |
+
" \"\"\"\n",
|
| 1560 |
+
" get ALL indexes of list elements\n",
|
| 1561 |
+
" Parameters\n",
|
| 1562 |
+
" ldata : list data to find index in\n",
|
| 1563 |
+
" fldata : list data for values for index look up\n",
|
| 1564 |
+
" \"\"\"\n",
|
| 1565 |
+
" return list(map(lambda e : fldata.index(e), ldata))\n",
|
| 1566 |
+
"\n",
|
| 1567 |
+
"def findIntersection(lOne, lTwo):\n",
|
| 1568 |
+
" \"\"\"\n",
|
| 1569 |
+
" find intersection elements between 2 lists\n",
|
| 1570 |
+
" Parameters\n",
|
| 1571 |
+
" lOne : first list of data\n",
|
| 1572 |
+
" lTwo : second list of data\n",
|
| 1573 |
+
" \"\"\"\n",
|
| 1574 |
+
" sOne = set(lOne)\n",
|
| 1575 |
+
" sTwo = set(lTwo)\n",
|
| 1576 |
+
" sInt = sOne.intersection(sTwo)\n",
|
| 1577 |
+
" return list(sInt)\n",
|
| 1578 |
+
"\n",
|
| 1579 |
+
"def isIntvOverlapped(rOne, rTwo):\n",
|
| 1580 |
+
" \"\"\"\n",
|
| 1581 |
+
" checks overlap between 2 intervals\n",
|
| 1582 |
+
" Parameters\n",
|
| 1583 |
+
" rOne : first interval boundaries\n",
|
| 1584 |
+
" rTwo : second interval boundaries\n",
|
| 1585 |
+
" \"\"\"\n",
|
| 1586 |
+
" clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1] \n",
|
| 1587 |
+
" return not clear\n",
|
| 1588 |
+
"\n",
|
| 1589 |
+
"def isIntvLess(rOne, rTwo):\n",
|
| 1590 |
+
" \"\"\"\n",
|
| 1591 |
+
" checks if first iterval is less than second\n",
|
| 1592 |
+
" Parameters\n",
|
| 1593 |
+
" rOne : first interval boundaries\n",
|
| 1594 |
+
" rTwo : second interval boundaries\n",
|
| 1595 |
+
" \"\"\"\n",
|
| 1596 |
+
" less = rOne[1] <= rTwo[0] \n",
|
| 1597 |
+
" return less\n",
|
| 1598 |
+
"\n",
|
| 1599 |
+
"def findRank(e, values):\n",
|
| 1600 |
+
" \"\"\"\n",
|
| 1601 |
+
" find rank of value in a list\n",
|
| 1602 |
+
" Parameters\n",
|
| 1603 |
+
" e : value to compare with\n",
|
| 1604 |
+
" values : list data\n",
|
| 1605 |
+
" \"\"\"\n",
|
| 1606 |
+
" count = 1\n",
|
| 1607 |
+
" for ve in values:\n",
|
| 1608 |
+
" if ve < e:\n",
|
| 1609 |
+
" count += 1\n",
|
| 1610 |
+
" return count\n",
|
| 1611 |
+
"\n",
|
| 1612 |
+
"def findRanks(toBeRanked, values):\n",
|
| 1613 |
+
" \"\"\"\n",
|
| 1614 |
+
" find ranks of values in one list in another list\n",
|
| 1615 |
+
" Parameters\n",
|
| 1616 |
+
" toBeRanked : list of values for which ranks are found\n",
|
| 1617 |
+
" values : list in which rank is found : \n",
|
| 1618 |
+
" \"\"\"\n",
|
| 1619 |
+
" return list(map(lambda e: findRank(e, values), toBeRanked))\n",
|
| 1620 |
+
"\n",
|
| 1621 |
+
"def formatFloat(prec, value, label = None):\n",
|
| 1622 |
+
" \"\"\"\n",
|
| 1623 |
+
" formats a float with optional label\n",
|
| 1624 |
+
" Parameters\n",
|
| 1625 |
+
" prec : precision\n",
|
| 1626 |
+
" value : data value\n",
|
| 1627 |
+
" label : label for data\n",
|
| 1628 |
+
" \"\"\"\n",
|
| 1629 |
+
" st = (label + \" \") if label else \"\"\n",
|
| 1630 |
+
" formatter = \"{:.\" + str(prec) + \"f}\" \n",
|
| 1631 |
+
" return st + formatter.format(value)\n",
|
| 1632 |
+
"\n",
|
| 1633 |
+
"def formatAny(value, label = None):\n",
|
| 1634 |
+
" \"\"\"\n",
|
| 1635 |
+
" formats any obkect with optional label\n",
|
| 1636 |
+
" Parameters\n",
|
| 1637 |
+
" value : data value\n",
|
| 1638 |
+
" label : label for data\n",
|
| 1639 |
+
" \"\"\"\n",
|
| 1640 |
+
" st = (label + \" \") if label else \"\"\n",
|
| 1641 |
+
" return st + str(value)\n",
|
| 1642 |
+
"\n",
|
| 1643 |
+
"def printList(values):\n",
|
| 1644 |
+
" \"\"\"\n",
|
| 1645 |
+
" pretty print list\n",
|
| 1646 |
+
" Parameters\n",
|
| 1647 |
+
" values : list of values\n",
|
| 1648 |
+
" \"\"\"\n",
|
| 1649 |
+
" for v in values:\n",
|
| 1650 |
+
" print(v)\n",
|
| 1651 |
+
"\n",
|
| 1652 |
+
"def printMap(values, klab, vlab, precision, offset=16):\n",
|
| 1653 |
+
" \"\"\"\n",
|
| 1654 |
+
" pretty print hash map\n",
|
| 1655 |
+
" Parameters\n",
|
| 1656 |
+
" values : dictionary of values\n",
|
| 1657 |
+
" klab : label for key\n",
|
| 1658 |
+
" vlab : label for value\n",
|
| 1659 |
+
" precision : precision\n",
|
| 1660 |
+
" offset : left justify offset\n",
|
| 1661 |
+
" \"\"\"\n",
|
| 1662 |
+
" print(klab.ljust(offset, \" \") + vlab)\n",
|
| 1663 |
+
" for k in values.keys():\n",
|
| 1664 |
+
" v = values[k]\n",
|
| 1665 |
+
" ks = toStr(k, precision).ljust(offset, \" \")\n",
|
| 1666 |
+
" vs = toStr(v, precision)\n",
|
| 1667 |
+
" print(ks + vs)\n",
|
| 1668 |
+
"\n",
|
| 1669 |
+
"def printPairList(values, lab1, lab2, precision, offset=16):\n",
|
| 1670 |
+
" \"\"\"\n",
|
| 1671 |
+
" pretty print list of pairs\n",
|
| 1672 |
+
" Parameters\n",
|
| 1673 |
+
" values : dictionary of values\n",
|
| 1674 |
+
" lab1 : first label\n",
|
| 1675 |
+
" lab2 : second label\n",
|
| 1676 |
+
" precision : precision\n",
|
| 1677 |
+
" offset : left justify offset\n",
|
| 1678 |
+
" \"\"\"\n",
|
| 1679 |
+
" print(lab1.ljust(offset, \" \") + lab2)\n",
|
| 1680 |
+
" for (v1, v2) in values:\n",
|
| 1681 |
+
" sv1 = toStr(v1, precision).ljust(offset, \" \")\n",
|
| 1682 |
+
" sv2 = toStr(v2, precision)\n",
|
| 1683 |
+
" print(sv1 + sv2)\n",
|
| 1684 |
+
"\n",
|
| 1685 |
+
"def createMap(*values):\n",
|
| 1686 |
+
" \"\"\"\n",
|
| 1687 |
+
" create disctionary with results\n",
|
| 1688 |
+
" Parameters\n",
|
| 1689 |
+
" values : sequence of key value pairs\n",
|
| 1690 |
+
" \"\"\"\n",
|
| 1691 |
+
" result = dict()\n",
|
| 1692 |
+
" for i in range(0, len(values), 2):\n",
|
| 1693 |
+
" result[values[i]] = values[i+1]\n",
|
| 1694 |
+
" return result\n",
|
| 1695 |
+
"\n",
|
| 1696 |
+
"def getColMinMax(table, col):\n",
|
| 1697 |
+
" \"\"\"\n",
|
| 1698 |
+
" return min, max values of a column\n",
|
| 1699 |
+
" Parameters\n",
|
| 1700 |
+
" table : tabular data\n",
|
| 1701 |
+
" col : column index\n",
|
| 1702 |
+
" \"\"\"\n",
|
| 1703 |
+
" vmin = None\n",
|
| 1704 |
+
" vmax = None\n",
|
| 1705 |
+
" for rec in table:\n",
|
| 1706 |
+
" value = rec[col]\n",
|
| 1707 |
+
" if vmin is None:\n",
|
| 1708 |
+
" vmin = value\n",
|
| 1709 |
+
" vmax = value\n",
|
| 1710 |
+
" else:\n",
|
| 1711 |
+
" if value < vmin:\n",
|
| 1712 |
+
" vmin = value\n",
|
| 1713 |
+
" elif value > vmax:\n",
|
| 1714 |
+
" vmax = value\n",
|
| 1715 |
+
" return (vmin, vmax, vmax - vmin)\n",
|
| 1716 |
+
"\n",
|
| 1717 |
+
"def createLogger(name, logFilePath, logLevName):\n",
|
| 1718 |
+
" \"\"\"\n",
|
| 1719 |
+
" creates logger\n",
|
| 1720 |
+
" Parameters\n",
|
| 1721 |
+
" name : logger name\n",
|
| 1722 |
+
" logFilePath : log file path\n",
|
| 1723 |
+
" logLevName : log level\n",
|
| 1724 |
+
" \"\"\"\n",
|
| 1725 |
+
" logger = logging.getLogger(name)\n",
|
| 1726 |
+
" fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)\n",
|
| 1727 |
+
" logLev = logLevName.lower()\n",
|
| 1728 |
+
" if logLev == \"debug\":\n",
|
| 1729 |
+
" logLevel = logging.DEBUG\n",
|
| 1730 |
+
" elif logLev == \"info\":\n",
|
| 1731 |
+
" logLevel = logging.INFO\n",
|
| 1732 |
+
" elif logLev == \"warning\":\n",
|
| 1733 |
+
" logLevel = logging.WARNING\n",
|
| 1734 |
+
" elif logLev == \"error\":\n",
|
| 1735 |
+
" logLevel = logging.ERROR\n",
|
| 1736 |
+
" elif logLev == \"critical\":\n",
|
| 1737 |
+
" logLevel = logging.CRITICAL\n",
|
| 1738 |
+
" else:\n",
|
| 1739 |
+
" raise ValueError(\"invalid log level name \" + logLevelName)\n",
|
| 1740 |
+
" fHandler.setLevel(logLevel)\n",
|
| 1741 |
+
" fFormat = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n",
|
| 1742 |
+
" fHandler.setFormatter(fFormat)\n",
|
| 1743 |
+
" logger.addHandler(fHandler)\n",
|
| 1744 |
+
" logger.setLevel(logLevel)\n",
|
| 1745 |
+
" return logger\n",
|
| 1746 |
+
"\n",
|
| 1747 |
+
"@contextmanager\n",
|
| 1748 |
+
"def suppressStdout():\n",
|
| 1749 |
+
" \"\"\"\n",
|
| 1750 |
+
" suppress stdout\n",
|
| 1751 |
+
" Parameters\n",
|
| 1752 |
+
" \"\"\"\n",
|
| 1753 |
+
" with open(os.devnull, \"w\") as devnull:\n",
|
| 1754 |
+
" oldStdout = sys.stdout\n",
|
| 1755 |
+
" sys.stdout = devnull\n",
|
| 1756 |
+
" try: \n",
|
| 1757 |
+
" yield\n",
|
| 1758 |
+
" finally:\n",
|
| 1759 |
+
" sys.stdout = oldStdout\n",
|
| 1760 |
+
"\n",
|
| 1761 |
+
"def exitWithMsg(msg):\n",
|
| 1762 |
+
" \"\"\"\n",
|
| 1763 |
+
" print message and exit\n",
|
| 1764 |
+
" Parameters\n",
|
| 1765 |
+
" msg : message\n",
|
| 1766 |
+
" \"\"\"\n",
|
| 1767 |
+
" print(msg + \" -- quitting\")\n",
|
| 1768 |
+
" sys.exit(0)\n",
|
| 1769 |
+
"\n",
|
| 1770 |
+
"def drawLine(data, yscale=None):\n",
|
| 1771 |
+
" \"\"\"\n",
|
| 1772 |
+
" line plot\n",
|
| 1773 |
+
" Parameters\n",
|
| 1774 |
+
" data : list data\n",
|
| 1775 |
+
" yscale : y axis scale\n",
|
| 1776 |
+
" \"\"\"\n",
|
| 1777 |
+
" plt.plot(data)\n",
|
| 1778 |
+
" if yscale:\n",
|
| 1779 |
+
" step = int(yscale / 10)\n",
|
| 1780 |
+
" step = int(step / 10) * 10\n",
|
| 1781 |
+
" plt.yticks(range(0, yscale, step))\n",
|
| 1782 |
+
" plt.show()\n",
|
| 1783 |
+
"\n",
|
| 1784 |
+
"def drawPlot(x, y, xlabel, ylabel):\n",
|
| 1785 |
+
" \"\"\"\n",
|
| 1786 |
+
" line plot\n",
|
| 1787 |
+
" Parameters\n",
|
| 1788 |
+
" x : x values\n",
|
| 1789 |
+
" y : y values\n",
|
| 1790 |
+
" xlabel : x axis label\n",
|
| 1791 |
+
" ylabel : y axis label\n",
|
| 1792 |
+
" \"\"\"\n",
|
| 1793 |
+
" plt.plot(x,y)\n",
|
| 1794 |
+
" plt.xlabel(xlabel)\n",
|
| 1795 |
+
" plt.ylabel(ylabel)\n",
|
| 1796 |
+
" plt.show()\n",
|
| 1797 |
+
"\n",
|
| 1798 |
+
"def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):\n",
|
| 1799 |
+
" \"\"\"\n",
|
| 1800 |
+
" line plot of 2 lines\n",
|
| 1801 |
+
" Parameters\n",
|
| 1802 |
+
" x : x values\n",
|
| 1803 |
+
" y1 : first y values\n",
|
| 1804 |
+
" y2 : second y values\n",
|
| 1805 |
+
" xlabel : x labbel\n",
|
| 1806 |
+
" ylabel : y label\n",
|
| 1807 |
+
" y1label : first plot label\n",
|
| 1808 |
+
" y2label : second plot label\n",
|
| 1809 |
+
" \"\"\"\n",
|
| 1810 |
+
" plt.plot(x, y1, label = y1label)\n",
|
| 1811 |
+
" plt.plot(x, y2, label = y2label)\n",
|
| 1812 |
+
" plt.xlabel(xlabel)\n",
|
| 1813 |
+
" plt.ylabel(ylabel)\n",
|
| 1814 |
+
" plt.legend()\n",
|
| 1815 |
+
" plt.show()\n",
|
| 1816 |
+
"\n",
|
| 1817 |
+
"def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):\n",
|
| 1818 |
+
" \"\"\"\n",
|
| 1819 |
+
" draw histogram\n",
|
| 1820 |
+
" Parameters\n",
|
| 1821 |
+
" ldata : list data\n",
|
| 1822 |
+
" myTitle : title\n",
|
| 1823 |
+
" myXlabel : x label\n",
|
| 1824 |
+
" myYlabel : y label \n",
|
| 1825 |
+
" nbins : num of bins\n",
|
| 1826 |
+
" \"\"\"\n",
|
| 1827 |
+
" plt.hist(ldata, bins=nbins, density=True)\n",
|
| 1828 |
+
" plt.title(myTitle)\n",
|
| 1829 |
+
" plt.xlabel(myXlabel)\n",
|
| 1830 |
+
" plt.ylabel(myYlabel)\n",
|
| 1831 |
+
" plt.show()\n",
|
| 1832 |
+
"\n",
|
| 1833 |
+
"def saveObject(obj, filePath):\n",
|
| 1834 |
+
" \"\"\"\n",
|
| 1835 |
+
" saves an object\n",
|
| 1836 |
+
" Parameters\n",
|
| 1837 |
+
" obj : object\n",
|
| 1838 |
+
" filePath : file path for saved object\n",
|
| 1839 |
+
" \"\"\"\n",
|
| 1840 |
+
" with open(filePath, \"wb\") as outfile:\n",
|
| 1841 |
+
" pickle.dump(obj,outfile)\n",
|
| 1842 |
+
"\n",
|
| 1843 |
+
"def restoreObject(filePath):\n",
|
| 1844 |
+
" \"\"\"\n",
|
| 1845 |
+
" restores an object\n",
|
| 1846 |
+
" Parameters\n",
|
| 1847 |
+
" filePath : file path to restore object from\n",
|
| 1848 |
+
" \"\"\"\n",
|
| 1849 |
+
" with open(filePath, \"rb\") as infile:\n",
|
| 1850 |
+
" obj = pickle.load(infile)\n",
|
| 1851 |
+
" return obj\n",
|
| 1852 |
+
"\n",
|
| 1853 |
+
"def isNumeric(data):\n",
|
| 1854 |
+
" \"\"\"\n",
|
| 1855 |
+
" true if all elements int or float\n",
|
| 1856 |
+
" Parameters\n",
|
| 1857 |
+
" data : numeric data list\n",
|
| 1858 |
+
" \"\"\"\n",
|
| 1859 |
+
" if type(data) == list or type(data) == np.ndarray:\n",
|
| 1860 |
+
" col = pd.Series(data)\n",
|
| 1861 |
+
" else:\n",
|
| 1862 |
+
" col = data\n",
|
| 1863 |
+
" return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64\n",
|
| 1864 |
+
"\n",
|
| 1865 |
+
"def isInteger(data):\n",
|
| 1866 |
+
" \"\"\"\n",
|
| 1867 |
+
" true if all elements int \n",
|
| 1868 |
+
" Parameters\n",
|
| 1869 |
+
" data : numeric data list\n",
|
| 1870 |
+
" \"\"\"\n",
|
| 1871 |
+
" if type(data) == list or type(data) == np.ndarray:\n",
|
| 1872 |
+
" col = pd.Series(data)\n",
|
| 1873 |
+
" else:\n",
|
| 1874 |
+
" col = data\n",
|
| 1875 |
+
" return col.dtype == np.int32 or col.dtype == np.int64\n",
|
| 1876 |
+
"\n",
|
| 1877 |
+
"def isFloat(data):\n",
|
| 1878 |
+
" \"\"\"\n",
|
| 1879 |
+
" true if all elements float\n",
|
| 1880 |
+
" Parameters\n",
|
| 1881 |
+
" data : numeric data list\n",
|
| 1882 |
+
" \"\"\"\n",
|
| 1883 |
+
" if type(data) == list or type(data) == np.ndarray:\n",
|
| 1884 |
+
" col = pd.Series(data)\n",
|
| 1885 |
+
" else:\n",
|
| 1886 |
+
" col = data\n",
|
| 1887 |
+
" return col.dtype == np.float32 or col.dtype == np.float64\n",
|
| 1888 |
+
"\n",
|
| 1889 |
+
"def isBinary(data):\n",
|
| 1890 |
+
" \"\"\"\n",
|
| 1891 |
+
" true if all elements either 0 or 1\n",
|
| 1892 |
+
" Parameters\n",
|
| 1893 |
+
" data : binary data\n",
|
| 1894 |
+
" \"\"\"\n",
|
| 1895 |
+
" re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)\n",
|
| 1896 |
+
" return (re is None)\n",
|
| 1897 |
+
"\n",
|
| 1898 |
+
"def isCategorical(data):\n",
|
| 1899 |
+
" \"\"\"\n",
|
| 1900 |
+
" true if all elements int or string\n",
|
| 1901 |
+
" Parameters\n",
|
| 1902 |
+
" data : data value\n",
|
| 1903 |
+
" \"\"\"\n",
|
| 1904 |
+
" re = next((d for d in data if not (type(d) == int or type(d) == str)), None)\n",
|
| 1905 |
+
" return (re is None)\n",
|
| 1906 |
+
"\n",
|
| 1907 |
+
"def assertEqual(value, veq, msg):\n",
|
| 1908 |
+
" \"\"\"\n",
|
| 1909 |
+
" assert equal to\n",
|
| 1910 |
+
" Parameters\n",
|
| 1911 |
+
" value : value\n",
|
| 1912 |
+
" veq : value to be equated with\n",
|
| 1913 |
+
" msg : error msg\n",
|
| 1914 |
+
" \"\"\"\n",
|
| 1915 |
+
" assert value == veq , msg\n",
|
| 1916 |
+
"\n",
|
| 1917 |
+
"def assertGreater(value, vmin, msg):\n",
|
| 1918 |
+
" \"\"\"\n",
|
| 1919 |
+
" assert greater than \n",
|
| 1920 |
+
" Parameters\n",
|
| 1921 |
+
" value : value\n",
|
| 1922 |
+
" vmin : minimum value\n",
|
| 1923 |
+
" msg : error msg\n",
|
| 1924 |
+
" \"\"\"\n",
|
| 1925 |
+
" assert value > vmin , msg\n",
|
| 1926 |
+
"\n",
|
| 1927 |
+
"def assertGreaterEqual(value, vmin, msg):\n",
|
| 1928 |
+
" \"\"\"\n",
|
| 1929 |
+
" assert greater than \n",
|
| 1930 |
+
" Parameters\n",
|
| 1931 |
+
" value : value\n",
|
| 1932 |
+
" vmin : minimum value\n",
|
| 1933 |
+
" msg : error msg\n",
|
| 1934 |
+
" \"\"\"\n",
|
| 1935 |
+
" assert value >= vmin , msg\n",
|
| 1936 |
+
"\n",
|
| 1937 |
+
"def assertLesser(value, vmax, msg):\n",
|
| 1938 |
+
" \"\"\"\n",
|
| 1939 |
+
" assert less than\n",
|
| 1940 |
+
" Parameters\n",
|
| 1941 |
+
" value : value\n",
|
| 1942 |
+
" vmax : maximum value\n",
|
| 1943 |
+
" msg : error msg\n",
|
| 1944 |
+
" \"\"\"\n",
|
| 1945 |
+
" assert value < vmax , msg\n",
|
| 1946 |
+
"\n",
|
| 1947 |
+
"def assertLesserEqual(value, vmax, msg):\n",
|
| 1948 |
+
" \"\"\"\n",
|
| 1949 |
+
" assert less than\n",
|
| 1950 |
+
" Parameters\n",
|
| 1951 |
+
" value : value\n",
|
| 1952 |
+
" vmax : maximum value\n",
|
| 1953 |
+
" msg : error msg\n",
|
| 1954 |
+
" \"\"\"\n",
|
| 1955 |
+
" assert value <= vmax , msg\n",
|
| 1956 |
+
"\n",
|
| 1957 |
+
"def assertWithinRange(value, vmin, vmax, msg):\n",
|
| 1958 |
+
" \"\"\"\n",
|
| 1959 |
+
" assert within range\n",
|
| 1960 |
+
" Parameters\n",
|
| 1961 |
+
" value : value\n",
|
| 1962 |
+
" vmin : minimum value\n",
|
| 1963 |
+
" vmax : maximum value\n",
|
| 1964 |
+
" msg : error msg\n",
|
| 1965 |
+
" \"\"\"\n",
|
| 1966 |
+
" assert value >= vmin and value <= vmax, msg\n",
|
| 1967 |
+
"\n",
|
| 1968 |
+
"def assertInList(value, values, msg):\n",
|
| 1969 |
+
" \"\"\"\n",
|
| 1970 |
+
" assert contains in a list\n",
|
| 1971 |
+
" Parameters\n",
|
| 1972 |
+
" value ; balue to check for inclusion\n",
|
| 1973 |
+
" values : list data\n",
|
| 1974 |
+
" msg : error msg\n",
|
| 1975 |
+
" \"\"\"\n",
|
| 1976 |
+
" assert value in values, msg\n",
|
| 1977 |
+
"\n",
|
| 1978 |
+
"def maxListDist(l1, l2):\n",
|
| 1979 |
+
" \"\"\"\n",
|
| 1980 |
+
" maximum list element difference between 2 lists\n",
|
| 1981 |
+
" Parameters\n",
|
| 1982 |
+
" l1 : first list data\n",
|
| 1983 |
+
" l2 : second list data\n",
|
| 1984 |
+
" \"\"\"\n",
|
| 1985 |
+
" dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))\t\n",
|
| 1986 |
+
" return dist\n",
|
| 1987 |
+
"\n",
|
| 1988 |
+
"def fileLineCount(fPath):\n",
|
| 1989 |
+
" \"\"\" \n",
|
| 1990 |
+
" number of lines ina file \n",
|
| 1991 |
+
" Parameters\n",
|
| 1992 |
+
" fPath : file path\n",
|
| 1993 |
+
" \"\"\"\n",
|
| 1994 |
+
" with open(fPath) as f:\n",
|
| 1995 |
+
" for i, li in enumerate(f):\n",
|
| 1996 |
+
" pass\n",
|
| 1997 |
+
" return (i + 1)\n",
|
| 1998 |
+
"\n",
|
| 1999 |
+
"def getAlphaNumCharCount(sdata):\n",
|
| 2000 |
+
" \"\"\" \n",
|
| 2001 |
+
" number of alphabetic and numeric charcters in a string \n",
|
| 2002 |
+
" Parameters\n",
|
| 2003 |
+
" sdata : string data\n",
|
| 2004 |
+
" \"\"\"\n",
|
| 2005 |
+
" acount = 0\n",
|
| 2006 |
+
" ncount = 0\n",
|
| 2007 |
+
" scount = 0\n",
|
| 2008 |
+
" ocount = 0\n",
|
| 2009 |
+
" assertEqual(type(sdata), str, \"input must be string\")\n",
|
| 2010 |
+
" for c in sdata:\n",
|
| 2011 |
+
" if c.isnumeric():\n",
|
| 2012 |
+
" ncount += 1\n",
|
| 2013 |
+
" elif c.isalpha():\n",
|
| 2014 |
+
" acount += 1\n",
|
| 2015 |
+
" elif c.isspace():\n",
|
| 2016 |
+
" scount += 1\n",
|
| 2017 |
+
" else:\n",
|
| 2018 |
+
" ocount += 1\n",
|
| 2019 |
+
" r = (acount, ncount, ocount)\n",
|
| 2020 |
+
" return r\n",
|
| 2021 |
+
"\n",
|
| 2022 |
+
"class StepFunction:\n",
|
| 2023 |
+
" \"\"\"\n",
|
| 2024 |
+
" step function\n",
|
| 2025 |
+
" Parameters\n",
|
| 2026 |
+
" \"\"\"\n",
|
| 2027 |
+
" def __init__(self, *values):\n",
|
| 2028 |
+
" \"\"\"\n",
|
| 2029 |
+
" initilizer\n",
|
| 2030 |
+
"\n",
|
| 2031 |
+
" Parameters\n",
|
| 2032 |
+
" values : list of tuples, wich each tuple containing 2 x values and corresponding y value\n",
|
| 2033 |
+
" \"\"\"\n",
|
| 2034 |
+
" self.points = values\n",
|
| 2035 |
+
"\n",
|
| 2036 |
+
" def find(self, x):\n",
|
| 2037 |
+
" \"\"\"\n",
|
| 2038 |
+
" finds step function value\n",
|
| 2039 |
+
"\n",
|
| 2040 |
+
" Parameters\n",
|
| 2041 |
+
" x : x value\n",
|
| 2042 |
+
" \"\"\"\n",
|
| 2043 |
+
" found = False\n",
|
| 2044 |
+
" y = 0\n",
|
| 2045 |
+
" for p in self.points:\n",
|
| 2046 |
+
" if (x >= p[0] and x < p[1]):\n",
|
| 2047 |
+
" y = p[2]\n",
|
| 2048 |
+
" found = True\n",
|
| 2049 |
+
" break\n",
|
| 2050 |
+
"\n",
|
| 2051 |
+
" if not found:\n",
|
| 2052 |
+
" l = len(self.points)\n",
|
| 2053 |
+
" if (x < self.points[0][0]):\n",
|
| 2054 |
+
" y = self.points[0][2]\n",
|
| 2055 |
+
" elif (x > self.points[l-1][1]):\n",
|
| 2056 |
+
" y = self.points[l-1][2]\n",
|
| 2057 |
+
" return y\n",
|
| 2058 |
+
"\n",
|
| 2059 |
+
"\n",
|
| 2060 |
+
"class DummyVarGenerator:\n",
|
| 2061 |
+
" \"\"\"\n",
|
| 2062 |
+
" dummy variable generator for categorical variable\n",
|
| 2063 |
+
" \"\"\"\n",
|
| 2064 |
+
" def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):\n",
|
| 2065 |
+
" \"\"\"\n",
|
| 2066 |
+
" initilizer\n",
|
| 2067 |
+
"\n",
|
| 2068 |
+
" Parameters\n",
|
| 2069 |
+
" rowSize : row size\n",
|
| 2070 |
+
" catValues : dictionary with field index as key and list of categorical values as value\n",
|
| 2071 |
+
" trueVal : true value, typically \"1\"\n",
|
| 2072 |
+
" falseval : false value , typically \"0\"\n",
|
| 2073 |
+
" delim : field delemeter\n",
|
| 2074 |
+
" \"\"\"\n",
|
| 2075 |
+
" self.rowSize = rowSize\n",
|
| 2076 |
+
" self.catValues = catValues\n",
|
| 2077 |
+
" numCatVar = len(catValues)\n",
|
| 2078 |
+
" colCount = 0\n",
|
| 2079 |
+
" for v in self.catValues.values():\n",
|
| 2080 |
+
" colCount += len(v)\n",
|
| 2081 |
+
" self.newRowSize = rowSize - numCatVar + colCount\n",
|
| 2082 |
+
" #print (\"new row size {}\".format(self.newRowSize))\n",
|
| 2083 |
+
" self.trueVal = trueVal\n",
|
| 2084 |
+
" self.falseVal = falseVal\n",
|
| 2085 |
+
" self.delim = delim\n",
|
| 2086 |
+
"\n",
|
| 2087 |
+
" def processRow(self, row):\n",
|
| 2088 |
+
" \"\"\"\n",
|
| 2089 |
+
" encodes categorical variables, returning as delemeter separate dstring or list\n",
|
| 2090 |
+
"\n",
|
| 2091 |
+
" Parameters\n",
|
| 2092 |
+
" row : row either delemeter separated string or list\n",
|
| 2093 |
+
" \"\"\"\n",
|
| 2094 |
+
" if self.delim is not None:\n",
|
| 2095 |
+
" rowArr = row.split(self.delim)\n",
|
| 2096 |
+
" msg = \"row does not have expected number of columns found \" + str(len(rowArr)) + \" expected \" + str(self.rowSize)\n",
|
| 2097 |
+
" assert len(rowArr) == self.rowSize, msg\n",
|
| 2098 |
+
" else:\n",
|
| 2099 |
+
" rowArr = row\n",
|
| 2100 |
+
"\n",
|
| 2101 |
+
" newRowArr = []\n",
|
| 2102 |
+
" for i in range(len(rowArr)):\n",
|
| 2103 |
+
" curVal = rowArr[i]\n",
|
| 2104 |
+
" if (i in self.catValues):\n",
|
| 2105 |
+
" values = self.catValues[i]\n",
|
| 2106 |
+
" for val in values:\n",
|
| 2107 |
+
" if val == curVal:\n",
|
| 2108 |
+
" newVal = self.trueVal\n",
|
| 2109 |
+
" else:\n",
|
| 2110 |
+
" newVal = self.falseVal\n",
|
| 2111 |
+
" newRowArr.append(newVal)\n",
|
| 2112 |
+
" else:\n",
|
| 2113 |
+
" newRowArr.append(curVal)\n",
|
| 2114 |
+
" assert len(newRowArr) == self.newRowSize, \"invalid new row size \" + str(len(newRowArr)) + \" expected \" + str(self.newRowSize)\n",
|
| 2115 |
+
" encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr\n",
|
| 2116 |
+
" return encRow\n"
|
| 2117 |
+
]
|
| 2118 |
+
}
|
| 2119 |
+
],
|
| 2120 |
+
"metadata": {
|
| 2121 |
+
"kernelspec": {
|
| 2122 |
+
"display_name": "Python 3 (ipykernel)",
|
| 2123 |
+
"language": "python",
|
| 2124 |
+
"name": "python3"
|
| 2125 |
+
},
|
| 2126 |
+
"language_info": {
|
| 2127 |
+
"codemirror_mode": {
|
| 2128 |
+
"name": "ipython",
|
| 2129 |
+
"version": 3
|
| 2130 |
+
},
|
| 2131 |
+
"file_extension": ".py",
|
| 2132 |
+
"mimetype": "text/x-python",
|
| 2133 |
+
"name": "python",
|
| 2134 |
+
"nbconvert_exporter": "python",
|
| 2135 |
+
"pygments_lexer": "ipython3",
|
| 2136 |
+
"version": "3.9.12"
|
| 2137 |
+
}
|
| 2138 |
+
},
|
| 2139 |
+
"nbformat": 4,
|
| 2140 |
+
"nbformat_minor": 5
|
| 2141 |
+
}
|
model/tnn/pdamb.mod
ADDED
|
Binary file (1.45 kB). View file
|
|
|