|
|
@@ -0,0 +1,250 @@ |
|
|
|
#!/usr/bin/env python |
|
|
|
# -*- coding:utf-8 -*- |
|
|
|
# ToolGood.Words.WordsSearch.py |
|
|
|
# 2020, Lin Zhijun, https://github.com/toolgood/ToolGood.Words |
|
|
|
# Licensed under the Apache License 2.0 |
|
|
|
# 更新日志 |
|
|
|
# 2020.04.06 第一次提交 |
|
|
|
# 2020.05.16 修改,支持大于0xffff的字符 |
|
|
|
|
|
|
|
# Public API of this module: only the WordsSearch class is exported by
# ``from <module> import *``.
__all__ = ['WordsSearch']

# Module metadata: author name.
__author__ = 'Lin Zhijun'

# Module metadata: date string (matches the latest changelog entry above).
__date__ = '2020.05.16'
|
|
|
|
|
|
|
class TrieNode():
    """Mutable trie node used while the keyword automaton is being built.

    Children are stored in ``m_values`` keyed by whatever value is passed to
    ``Add`` (``WordsSearch.SetKeywords`` passes ``ord(char)``).
    """

    def __init__(self):
        # Position of this node in the builder's node list (0 until assigned).
        self.Index = 0
        # Depth of the node in the trie; 0 means "not assigned yet".
        self.Layer = 0
        # True once at least one result index has been recorded on this node.
        self.End = False
        # Key under which this node is stored in its parent's ``m_values``.
        self.Char = ''
        # Result indices recorded through SetResults (duplicates are kept).
        self.Results = []
        # Child nodes keyed by the value given to Add.
        self.m_values = {}
        # Failure link; filled in by the automaton builder.
        self.Failure = None
        # Parent node; None for a node that was not created through Add.
        self.Parent = None

    def Add(self, c):
        """Return the child stored under key ``c``, creating it if missing.

        A newly created child gets ``Parent`` set to this node and ``Char``
        set to ``c``.
        """
        try:
            return self.m_values[c]
        except KeyError:
            pass
        child = TrieNode()
        child.Parent = self
        child.Char = c
        self.m_values[c] = child
        return child

    def SetResults(self, index):
        """Mark this node as an end node and append ``index`` to ``Results``."""
        if not self.End:
            self.End = True
        self.Results.append(index)
|
|
|
|
|
|
|
class TrieNode2():
    """Search-time node: a transition table plus the result indices that
    end at this node.

    ``minflag`` / ``maxflag`` track the smallest and largest key stored in
    ``m_values`` and are used by ``TryGetValue`` as a cheap range pre-check.
    """

    def __init__(self):
        # True once at least one result index has been recorded.
        self.End = False
        # Distinct result indices, in the order they were first recorded.
        self.Results = []
        # Transition table: key -> next node.
        self.m_values = {}
        # Seed the lower bound with the largest Unicode code point so that the
        # first key added always tightens it, including keys above 0xffff
        # (the previous seed of 0xffff left the bound stuck at 0xffff for
        # such keys).
        self.minflag = 0x10FFFF
        self.maxflag = 0

    def Add(self, c, node3):
        """Store ``node3`` as the transition for key ``c`` and widen the
        [minflag, maxflag] bounds to include ``c``."""
        if c < self.minflag:
            self.minflag = c
        if c > self.maxflag:
            self.maxflag = c
        self.m_values[c] = node3

    def SetResults(self, index):
        """Mark this node as an end node and record ``index`` once."""
        if not self.End:
            self.End = True
        if index not in self.Results:
            self.Results.append(index)

    def HasKey(self, c):
        """Return True when a transition for key ``c`` exists."""
        return c in self.m_values

    def TryGetValue(self, c):
        """Return the node stored for key ``c``, or None when there is none."""
        if self.minflag <= c <= self.maxflag:
            return self.m_values.get(c)
        return None
|
|
|
|
|
|
|
|
|
|
|
class WordsSearch():
    """Search a text for any of a fixed set of keywords in a single pass.

    ``SetKeywords`` builds a trie of the keywords, links every node to its
    failure node, and then flattens the result into ``TrieNode2`` nodes whose
    transition tables already contain the transitions reachable through the
    failure chain.  The search methods walk that flattened structure one
    character at a time.
    """

    def __init__(self):
        # Start with an empty node (not a dict) so that the search methods can
        # be called before SetKeywords and simply find nothing.
        self._first = TrieNode2()
        self._keywords = []
        self._indexs = []

    def SetKeywords(self, keywords):
        """Compile ``keywords`` into the search structure.

        Args:
            keywords: iterable of strings.  Each character is converted with
                ``ord`` to form the transition keys.  An empty string only
                marks the root node, which the search methods never test, so
                it never produces a match.
        """
        # Keep a private copy so later changes to the caller's list cannot
        # desynchronise the result indices stored in the nodes.
        self._keywords = list(keywords)
        self._indexs = list(range(len(self._keywords)))

        root, nodes = self._build_trie(self._keywords)
        self._link_failures(root, nodes)
        self._first = self._compile(root, nodes)

    @staticmethod
    def _build_trie(keywords):
        """Insert every keyword into a fresh trie; return (root, nodes) where
        ``nodes`` lists the root followed by all nodes ordered by depth."""
        root = TrieNode()
        layers = {}
        for position, word in enumerate(keywords):
            nd = root
            for depth, ch in enumerate(word, 1):
                nd = nd.Add(ord(ch))
                if nd.Layer == 0:
                    # Layer 0 means Add has just created this node.
                    nd.Layer = depth
                    layers.setdefault(depth, []).append(nd)
            nd.SetResults(position)

        nodes = [root]
        # Failure linking needs shallower nodes first; sort the depths
        # explicitly instead of relying on dict iteration order.
        for depth in sorted(layers):
            nodes.extend(layers[depth])
        return root, nodes

    @staticmethod
    def _link_failures(root, nodes):
        """Assign Index and Failure to every non-root node and copy the
        failure node's results onto it."""
        for position, nd in enumerate(nodes):
            if position == 0:
                continue  # the root is handled after the loop
            nd.Index = position
            r = nd.Parent.Failure
            c = nd.Char
            # root.Failure is still None here, which terminates the walk.
            while r is not None and c not in r.m_values:
                r = r.Failure
            if r is None:
                nd.Failure = root
            else:
                nd.Failure = r.m_values[c]
                for item in nd.Failure.Results:
                    nd.SetResults(item)
        root.Failure = root

    @staticmethod
    def _compile(root, nodes):
        """Flatten the linked trie into TrieNode2 nodes and return the new
        root node."""
        compiled = [TrieNode2() for _ in nodes]
        for old, new in zip(nodes, compiled):
            for key, child in old.m_values.items():
                new.Add(key, compiled[child.Index])
            for item in old.Results:
                new.SetResults(item)

            # Merge in transitions and results from the failure chain.  A
            # node's own (or a nearer failure node's) transition wins.  The
            # root is excluded; the scanner falls back to it explicitly.
            fail = old.Failure
            while fail is not root:
                for key, child in fail.m_values.items():
                    if not new.HasKey(key):
                        new.Add(key, compiled[child.Index])
                for item in fail.Results:
                    new.SetResults(item)
                fail = fail.Failure
        return compiled[0]

    def _scan(self, text):
        """Yield ``(index, node)`` for every position of ``text`` at which the
        current node is an end node (``node.End`` is true)."""
        first = self._first
        ptr = None
        for index, ch in enumerate(text):
            code = ord(ch)
            tn = ptr.TryGetValue(code) if ptr is not None else None
            if tn is None:
                # No transition from the current node: restart from the root.
                tn = first.TryGetValue(code)
            if tn is not None and tn.End:
                yield index, tn
            ptr = tn

    def _make_result(self, item, index):
        """Build the result dict for keyword number ``item`` whose last
        character sits at position ``index`` of the searched text."""
        keyword = self._keywords[item]
        return {"Keyword": keyword, "Success": True, "End": index,
                "Start": index + 1 - len(keyword), "Index": self._indexs[item]}

    def FindFirst(self, text):
        """Return a result dict for the first match in ``text``, or None.

        The dict has the keys ``Keyword``, ``Success``, ``End`` (position of
        the keyword's last character), ``Start`` and ``Index`` (position of
        the keyword in the list given to SetKeywords).  When several keywords
        end at the same position, the first entry of the node's ``Results``
        is reported.
        """
        hit = next(self._scan(text), None)
        if hit is None:
            return None
        index, tn = hit
        return self._make_result(tn.Results[0], index)

    def FindAll(self, text):
        """Return a list of result dicts, one for every keyword occurrence in
        ``text`` (same dict layout as FindFirst); empty when nothing matches."""
        return [self._make_result(item, index)
                for index, tn in self._scan(text)
                for item in tn.Results]

    def ContainsAny(self, text):
        """Return True when ``text`` contains at least one keyword."""
        return next(self._scan(text), None) is not None

    def Replace(self, text, replaceChar='*'):
        """Return ``text`` with every character of each matched keyword
        replaced by ``replaceChar``.

        At each matching position the keyword given by the first entry of the
        node's ``Results`` determines how many characters are overwritten.
        """
        result = list(text)
        for index, tn in self._scan(text):
            length = len(self._keywords[tn.Results[0]])
            for j in range(index + 1 - length, index + 1):
                result[j] = replaceChar
        return ''.join(result)