|
- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
- # ToolGood.Words.WordsSearch.py
- # 2020, Lin Zhijun, https://github.com/toolgood/ToolGood.Words
- # Licensed under the Apache License 2.0
- # 更新日志
- # 2020.04.06 第一次提交
- # 2020.05.16 修改,支持大于0xffff的字符
-
- __all__ = ['WordsSearch']
- __author__ = 'Lin Zhijun'
- __date__ = '2020.05.16'
-
class TrieNode():
    """Mutable trie node used while the keyword automaton is being built.

    Attributes:
        Index:    integer slot for the node; starts at 0 and is meant to be
                  assigned by whoever flattens the trie into a list.
        Layer:    depth marker; 0 means "not assigned yet".
        End:      True once at least one result has been recorded here.
        Char:     key under which this node is stored in its parent's
                  ``m_values`` ('' for a node created directly).
        Results:  result indices recorded through ``SetResults``, in call order.
        m_values: mapping of key -> child ``TrieNode``.
        Failure:  failure link (None until the builder sets it).
        Parent:   node whose ``Add`` created this node (None otherwise).
    """

    def __init__(self):
        self.Index = 0
        self.Layer = 0
        self.End = False
        self.Char = ''
        self.Results = []
        self.m_values = {}
        self.Failure = None
        self.Parent = None

    def Add(self, c):
        """Return the child stored under key ``c``, creating it if needed."""
        child = self.m_values.get(c)
        if child is None:
            child = TrieNode()
            child.Parent = self
            child.Char = c
            self.m_values[c] = child
        return child

    def SetResults(self, index):
        """Mark this node as terminal and append ``index`` to ``Results``."""
        self.End = True
        self.Results.append(index)
-
class TrieNode2():
    """Lookup node with a cheap key-range pre-check.

    Attributes:
        End:      True once at least one result has been recorded here.
        Results:  distinct result indices, in first-seen order.
        m_values: mapping of integer key -> target node.
        minflag:  smallest key added so far.
        maxflag:  largest key added so far.
    """

    def __init__(self):
        self.End = False
        self.Results = []
        self.m_values = {}
        # Start at the highest Unicode code point so that the first key added
        # lowers the bound even for characters above 0xffff.
        self.minflag = 0x10FFFF
        self.maxflag = 0

    def Add(self, c, node3):
        """Store ``node3`` under key ``c`` and widen the key range to cover it."""
        if c < self.minflag:
            self.minflag = c
        if c > self.maxflag:
            self.maxflag = c
        self.m_values[c] = node3

    def SetResults(self, index):
        """Mark this node as terminal and record ``index`` if not yet present."""
        self.End = True
        if index not in self.Results:
            self.Results.append(index)

    def HasKey(self, c):
        """Return True when a target is stored under key ``c``."""
        return c in self.m_values

    def TryGetValue(self, c):
        """Return the node stored under key ``c``, or None when there is none."""
        # Keys outside [minflag, maxflag] cannot be present; skip the dict.
        if self.minflag <= c <= self.maxflag:
            return self.m_values.get(c)
        return None
-
-
class WordsSearch():
    """Multi-keyword text matcher backed by a trie with failure links.

    Call ``SetKeywords`` once, then use ``FindFirst`` / ``FindAll`` /
    ``ContainsAny`` / ``Replace`` on any number of texts.  Searching before
    ``SetKeywords`` has been called behaves as if no keywords were configured.

    A match is reported as a dict with the keys ``"Keyword"``, ``"Success"``
    (always True), ``"Start"`` and ``"End"`` (inclusive character positions
    in the text) and ``"Index"`` (position of the keyword in the list given
    to ``SetKeywords``).
    """

    def __init__(self):
        # Root lookup node; None until SetKeywords has built the automaton.
        self._first = None
        self._keywords = []
        self._indexs = []

    def SetKeywords(self, keywords):
        """Build the search automaton for ``keywords``.

        Args:
            keywords: iterable of strings.  It is copied, so later changes to
                the caller's container do not affect this instance.
        """
        self._keywords = list(keywords)
        self._indexs = list(range(len(self._keywords)))

        # Pass 1: build the trie and group nodes by depth.
        root = TrieNode()
        layers = {}
        for i, word in enumerate(self._keywords):
            if not word:
                # An empty keyword cannot produce a meaningful match; skipping
                # it keeps the root node from being flagged as terminal.
                continue
            nd = root
            for depth, ch in enumerate(word, 1):
                nd = nd.Add(ord(ch))
                if nd.Layer == 0:  # node was just created
                    nd.Layer = depth
                    layers.setdefault(depth, []).append(nd)
            nd.SetResults(i)

        # Flatten breadth-first.  Failure links below require every shallower
        # node to be processed first, so order the layers explicitly instead
        # of relying on dict iteration order.
        allNode = [root]
        for depth in sorted(layers):
            allNode.extend(layers[depth])

        # Pass 2: failure links.  root.Failure stays None during this loop and
        # serves as the terminator for the upward walk.
        for i, nd in enumerate(allNode):
            if i == 0:
                continue
            nd.Index = i
            r = nd.Parent.Failure
            c = nd.Char
            while r is not None and c not in r.m_values:
                r = r.Failure
            if r is None:
                nd.Failure = root
            else:
                nd.Failure = r.m_values[c]
                # Keywords ending at the failure target also end here.
                for inherited in nd.Failure.Results:
                    nd.SetResults(inherited)
        root.Failure = root

        # Pass 3: convert to lookup nodes, folding the transitions and results
        # reachable through the failure chain (excluding the root) into each
        # node so that searching never has to follow failure links.
        allNode2 = [TrieNode2() for _ in allNode]
        for oldNode, newNode in zip(allNode, allNode2):
            for key, child in oldNode.m_values.items():
                newNode.Add(key, allNode2[child.Index])
            for item in oldNode.Results:
                newNode.SetResults(item)

            fallback = oldNode.Failure
            while fallback is not root:
                for key, child in fallback.m_values.items():
                    # The node's own (deeper) transitions take precedence.
                    if not newNode.HasKey(key):
                        newNode.Add(key, allNode2[child.Index])
                for item in fallback.Results:
                    newNode.SetResults(item)
                fallback = fallback.Failure

        self._first = allNode2[0]

    def _matches(self, text):
        """Yield ``(end_index, node)`` for each position where a keyword ends."""
        root = self._first
        if root is None:  # SetKeywords has not been called yet
            return
        ptr = None
        for index, ch in enumerate(text):
            code = ord(ch)
            tn = ptr.TryGetValue(code) if ptr is not None else None
            if tn is None:
                # No continuation from the current state: restart at the root.
                tn = root.TryGetValue(code)
            if tn is not None and tn.End:
                yield index, tn
            ptr = tn

    def _result(self, item, end):
        """Build the match dict for keyword number ``item`` ending at ``end``."""
        keyword = self._keywords[item]
        return {
            "Keyword": keyword,
            "Success": True,
            "End": end,
            "Start": end + 1 - len(keyword),
            "Index": self._indexs[item],
        }

    def FindFirst(self, text):
        """Return the match dict for the earliest-ending match, or None."""
        hit = next(self._matches(text), None)
        if hit is None:
            return None
        index, tn = hit
        return self._result(tn.Results[0], index)

    def FindAll(self, text):
        """Return a list of match dicts for every keyword occurrence in ``text``."""
        return [
            self._result(item, index)
            for index, tn in self._matches(text)
            for item in tn.Results
        ]

    def ContainsAny(self, text):
        """Return True when at least one keyword occurs in ``text``."""
        return next(self._matches(text), None) is not None

    def Replace(self, text, replaceChar='*'):
        """Return ``text`` with every matched span overwritten by ``replaceChar``.

        At each position where a match ends, the span of the first keyword
        recorded on the matching node is replaced, one ``replaceChar`` per
        original character.
        """
        result = list(text)
        for i, tn in self._matches(text):
            span = len(self._keywords[tn.Results[0]])
            for j in range(i + 1 - span, i + 1):
                result[j] = replaceChar
        return ''.join(result)
|