"""Streamlit app: duplicate records prediction with a neural network model."""
| import os | |
| import sys | |
| import random | |
| import statistics | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import threading | |
| import time | |
| import queue | |
| sys.path.append(os.path.abspath("../lib")) | |
| sys.path.append(os.path.abspath("../supv")) | |
| sys.path.append(os.path.abspath("../text")) | |
| from util import * | |
| from sampler import * | |
| from tnn import * | |
| from txproc import * | |
| import streamlit as st | |
# common email domains referenced by the matching pipeline
emailDoms = "yahoo.com gmail.com hotmail.com aol.com".split()

# streamlit page heading
st.title("Duplicate Records Prediction")
def printNgramVec(ngv):
    """
    Print the non-zero entries of an ngram count vector.

    Parameters:
        ngv : sequence of ngram counts indexed by ngram id; only indexes
              with a positive count are printed, one per line as "index count"
    """
    print("ngram vector")
    # enumerate instead of range(len(...)) — same output, idiomatic iteration
    for i, cnt in enumerate(ngv):
        if cnt > 0:
            print("{} {}".format(i, cnt))
def createNegMatch(tdata, ri):
    """
    Create a negative match by randomly selecting a record other than the
    one at index ri.

    Parameters:
        tdata : list of records
        ri : index of the record that must NOT be returned

    Returns:
        a record from tdata at some index != ri

    Raises:
        ValueError : if tdata has fewer than 2 records — the original code
                     would loop forever in that case
    """
    if len(tdata) < 2:
        raise ValueError("need at least 2 records to create a negative match")
    nri = randomInt(0, len(tdata) - 1)
    # re-draw until we land on a different record
    while nri == ri:
        nri = randomInt(0, len(tdata) - 1)
    return tdata[nri]
def createNgramCreator():
    """
    Build and return the character ngram generator used for field similarity.
    """
    # 3-grams over lower-case, upper-case and digit character classes
    generator = CharNGram(["lcc", "ucc", "dig"], 3, True)
    # special characters that occur in email addresses and names
    generator.addSpChar(["@", "#", "_", "-", "."])
    # whitespace is replaced by a single placeholder character
    generator.setWsRepl("$")
    generator.finalize()
    return generator
def getSim(rec, incOutput=True, ngramCr=None):
    """
    Compute the field-wise similarity vector for a record pair.

    Parameters:
        rec : flat list holding the 6 fields of one record followed by the
              6 fields of the other; when incOutput is True the last element
              is the label, which is appended to the result
        incOutput : include rec's trailing label in the returned string
        ngramCr : optional CharNGram generator; when None, falls back to the
                  module-global cng.  NOTE(review): this file only assigns
                  cng inside predict_main, so the global must be published
                  there (or ngramCr passed) or this raises NameError.

    Returns:
        comma separated similarity string (6 values, plus label if requested)
    """
    nc = ngramCr if ngramCr is not None else cng
    sim = list()
    for i in range(6):
        if i == 3:
            # field 3 is compared with edit-distance based similarity
            s = levenshteinSimilarity(rec[i], rec[i + 6])
        else:
            # remaining fields use cosine similarity of char ngram counts
            ngv1 = nc.toMgramCount(rec[i])
            ngv2 = nc.toMgramCount(rec[i + 6])
            s = cosineSimilarity(ngv1, ngv2)
        sim.append(s)
    ss = toStrFromList(sim, 6)
    srec = ss + "," + rec[-1] if incOutput else ss
    return srec
class SimThread(threading.Thread):
    """Worker thread for multi threaded record pair similarity calculation."""

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """
        Initialize the worker.

        Parameters:
            tName : thread name
            cng : char ngram generator (stored; similarity currently uses
                  the module-global generator inside getSim)
            qu : work queue of record pairs
            incOutput : include label field in similarity output
            outQu : output queue for similarity strings; None means print
            outQuSize : output queue capacity
        """
        threading.Thread.__init__(self)
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """Consume record pairs and emit similarity results until process exit."""
        # NOTE(review): exitFlag is local and never set True, so the loop only
        # ends when the process dies; workQuLock/outQuLock are looked up at
        # module scope — confirm they are published as globals by the caller.
        exitFlag = False
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is not None:
                srec = getSim(rec, self.incOutput)
                # fixed: original tested the bare global `outQu` instead of
                # this thread's own self.outQu
                if self.outQu is None:
                    print(srec)
                else:
                    enqueue(srec, self.outQu, outQuLock, self.outQuSize)
def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """
    Create and start nworker similarity worker threads.

    Parameters:
        nworker : number of worker threads
        cng : char ngram generator passed to each worker
        workQu : shared work queue of record pairs
        incOutput : include label field in similarity output
        outQu : shared output queue (None means workers print results)
        outQuSize : output queue capacity

    Returns:
        list of started SimThread instances
    """
    threads = list()
    for i in range(nworker):
        thread = SimThread("Thread-" + str(i + 1), cng, workQu, incOutput, outQu, outQuSize)
        # daemonize: workers poll forever and would otherwise keep the
        # process alive after the main thread finishes
        thread.daemon = True
        thread.start()
        threads.append(thread)
    return threads
def enqueue(rec, qu, quLock, qSize):
    """
    Enqueue a record, blocking (with polling) while the queue is near capacity.

    Parameters:
        rec : record to enqueue
        qu : target queue
        quLock : lock guarding the queue
        qSize : nominal capacity; one slot is kept free

    The original version slept 1 second after EVERY attempt, including
    successful ones, and leaked the lock on exception; now the sleep happens
    only when the queue is full and the lock is released via `with`.
    """
    queued = False
    while not queued:
        with quLock:
            if qu.qsize() < qSize - 1:
                qu.put(rec)
                queued = True
        if not queued:
            # queue full: back off before retrying
            time.sleep(1)
def dequeue(qu, quLock):
    """
    Remove and return one record from the queue, or None if it is empty.

    Parameters:
        qu : source queue
        quLock : lock guarding the queue

    Uses `with quLock` so the lock is released even if qu.get() raises
    (the original manual acquire/release would leak the lock).
    """
    rec = None
    with quLock:
        if not qu.empty():
            rec = qu.get()
    return rec
# default input files and model configuration
test_file = 'pers_new_dup.txt'       # new records to check for duplicates
exist_file = 'pers_exist.txt'        # existing records to compare against
prop_file = 'tnn_disamb.properties'  # neural network model properties file
def predict_main(test_file, exist_file, prop_file):
    """
    Predict duplicate likelihood for each new record against all existing records.

    Every record from test_file is paired with every record from exist_file;
    worker threads compute field similarity vectors, the trained network
    scores each vector, and the maximum score per new record is printed.

    Parameters:
        test_file : path to file with new (possibly duplicate) records
        exist_file : path to file with existing records
        prop_file : neural network properties file
    """
    # getSim and SimThread.run resolve cng and the queue locks at module
    # scope; publish them as globals (the original local assignments caused
    # NameError inside the worker threads)
    global cng, workQuLock, outQuLock
    workQuLock = threading.Lock()
    outQuLock = threading.Lock()

    newFilePath = test_file
    existFilePath = exist_file
    nworker = 1

    # build the trained feed forward network model
    regr = FeedForwardNetwork(prop_file)
    regr.buildModel()
    cng = createNgramCreator()

    # bounded work/output queues and worker threads
    qSize = 100
    workQu = queue.Queue(qSize)
    outQu = queue.Queue(qSize)
    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

    for nrec in fileRecGen(newFilePath):
        srecs = list()
        ecount = 0
        y_pred = []
        for erec in fileRecGen(existFilePath):
            # pair the new record with an existing one and hand it to workers
            rec = nrec.copy()
            rec.extend(erec)
            enqueue(rec, workQu, workQuLock, qSize)
            # opportunistically drain the output queue while producing
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))
            ecount += 1

        # wait until the work queue is drained (sleep instead of busy spin)
        while not workQu.empty():
            time.sleep(0.01)

        # drain remaining results from the output queue
        while len(srecs) < ecount:
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))

        # score all pairs and report the best match for this record
        sims = FeedForwardNetwork.predict(regr, srecs)
        sims = sims.reshape(sims.shape[0])
        y_pred.append(max(sims))
        print(nrec, max(y_pred))
# run the prediction at script load — streamlit re-executes the whole
# script on each interaction
predict_main(test_file,exist_file,prop_file)
st.header("End")