250 line
7.9KB

  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # ToolGood.Words.WordsSearch.py
  4. # 2020, Lin Zhijun, https://github.com/toolgood/ToolGood.Words
  5. # Licensed under the Apache License 2.0
  6. # 更新日志
  7. # 2020.04.06 第一次提交
  8. # 2020.05.16 修改,支持大于0xffff的字符
  9. __all__ = ['WordsSearch']
  10. __author__ = 'Lin Zhijun'
  11. __date__ = '2020.05.16'
  12. class TrieNode():
  13. def __init__(self):
  14. self.Index = 0
  15. self.Index = 0
  16. self.Layer = 0
  17. self.End = False
  18. self.Char = ''
  19. self.Results = []
  20. self.m_values = {}
  21. self.Failure = None
  22. self.Parent = None
  23. def Add(self,c):
  24. if c in self.m_values :
  25. return self.m_values[c]
  26. node = TrieNode()
  27. node.Parent = self
  28. node.Char = c
  29. self.m_values[c] = node
  30. return node
  31. def SetResults(self,index):
  32. if (self.End == False):
  33. self.End = True
  34. self.Results.append(index)
  35. class TrieNode2():
  36. def __init__(self):
  37. self.End = False
  38. self.Results = []
  39. self.m_values = {}
  40. self.minflag = 0xffff
  41. self.maxflag = 0
  42. def Add(self,c,node3):
  43. if (self.minflag > c):
  44. self.minflag = c
  45. if (self.maxflag < c):
  46. self.maxflag = c
  47. self.m_values[c] = node3
  48. def SetResults(self,index):
  49. if (self.End == False) :
  50. self.End = True
  51. if (index in self.Results )==False :
  52. self.Results.append(index)
  53. def HasKey(self,c):
  54. return c in self.m_values
  55. def TryGetValue(self,c):
  56. if (self.minflag <= c and self.maxflag >= c):
  57. if c in self.m_values:
  58. return self.m_values[c]
  59. return None
  60. class WordsSearch():
  61. def __init__(self):
  62. self._first = {}
  63. self._keywords = []
  64. self._indexs=[]
  65. def SetKeywords(self,keywords):
  66. self._keywords = keywords
  67. self._indexs=[]
  68. for i in range(len(keywords)):
  69. self._indexs.append(i)
  70. root = TrieNode()
  71. allNodeLayer={}
  72. for i in range(len(self._keywords)): # for (i = 0; i < _keywords.length; i++)
  73. p = self._keywords[i]
  74. nd = root
  75. for j in range(len(p)): # for (j = 0; j < p.length; j++)
  76. nd = nd.Add(ord(p[j]))
  77. if (nd.Layer == 0):
  78. nd.Layer = j + 1
  79. if nd.Layer in allNodeLayer:
  80. allNodeLayer[nd.Layer].append(nd)
  81. else:
  82. allNodeLayer[nd.Layer]=[]
  83. allNodeLayer[nd.Layer].append(nd)
  84. nd.SetResults(i)
  85. allNode = []
  86. allNode.append(root)
  87. for key in allNodeLayer.keys():
  88. for nd in allNodeLayer[key]:
  89. allNode.append(nd)
  90. allNodeLayer=None
  91. for i in range(len(allNode)): # for (i = 0; i < allNode.length; i++)
  92. if i==0 :
  93. continue
  94. nd=allNode[i]
  95. nd.Index = i
  96. r = nd.Parent.Failure
  97. c = nd.Char
  98. while (r != None and (c in r.m_values)==False):
  99. r = r.Failure
  100. if (r == None):
  101. nd.Failure = root
  102. else:
  103. nd.Failure = r.m_values[c]
  104. for key2 in nd.Failure.Results :
  105. nd.SetResults(key2)
  106. root.Failure = root
  107. allNode2 = []
  108. for i in range(len(allNode)): # for (i = 0; i < allNode.length; i++)
  109. allNode2.append( TrieNode2())
  110. for i in range(len(allNode2)): # for (i = 0; i < allNode2.length; i++)
  111. oldNode = allNode[i]
  112. newNode = allNode2[i]
  113. for key in oldNode.m_values :
  114. index = oldNode.m_values[key].Index
  115. newNode.Add(key, allNode2[index])
  116. for index in range(len(oldNode.Results)): # for (index = 0; index < oldNode.Results.length; index++)
  117. item = oldNode.Results[index]
  118. newNode.SetResults(item)
  119. oldNode=oldNode.Failure
  120. while oldNode != root:
  121. for key in oldNode.m_values :
  122. if (newNode.HasKey(key) == False):
  123. index = oldNode.m_values[key].Index
  124. newNode.Add(key, allNode2[index])
  125. for index in range(len(oldNode.Results)):
  126. item = oldNode.Results[index]
  127. newNode.SetResults(item)
  128. oldNode=oldNode.Failure
  129. allNode = None
  130. root = None
  131. # first = []
  132. # for index in range(65535):# for (index = 0; index < 0xffff; index++)
  133. # first.append(None)
  134. # for key in allNode2[0].m_values :
  135. # first[key] = allNode2[0].m_values[key]
  136. self._first = allNode2[0]
  137. def FindFirst(self,text):
  138. ptr = None
  139. for index in range(len(text)): # for (index = 0; index < text.length; index++)
  140. t =ord(text[index]) # text.charCodeAt(index)
  141. tn = None
  142. if (ptr == None):
  143. tn = self._first.TryGetValue(t)
  144. else:
  145. tn = ptr.TryGetValue(t)
  146. if (tn==None):
  147. tn = self._first.TryGetValue(t)
  148. if (tn != None):
  149. if (tn.End):
  150. item = tn.Results[0]
  151. keyword = self._keywords[item]
  152. return { "Keyword": keyword, "Success": True, "End": index, "Start": index + 1 - len(keyword), "Index": self._indexs[item] }
  153. ptr = tn
  154. return None
  155. def FindAll(self,text):
  156. ptr = None
  157. list = []
  158. for index in range(len(text)): # for (index = 0; index < text.length; index++)
  159. t =ord(text[index]) # text.charCodeAt(index)
  160. tn = None
  161. if (ptr == None):
  162. tn = self._first.TryGetValue(t)
  163. else:
  164. tn = ptr.TryGetValue(t)
  165. if (tn==None):
  166. tn = self._first.TryGetValue(t)
  167. if (tn != None):
  168. if (tn.End):
  169. for j in range(len(tn.Results)): # for (j = 0; j < tn.Results.length; j++)
  170. item = tn.Results[j]
  171. keyword = self._keywords[item]
  172. list.append({ "Keyword": keyword, "Success": True, "End": index, "Start": index + 1 - len(keyword), "Index": self._indexs[item] })
  173. ptr = tn
  174. return list
  175. def ContainsAny(self,text):
  176. ptr = None
  177. for index in range(len(text)): # for (index = 0; index < text.length; index++)
  178. t =ord(text[index]) # text.charCodeAt(index)
  179. tn = None
  180. if (ptr == None):
  181. tn = self._first.TryGetValue(t)
  182. else:
  183. tn = ptr.TryGetValue(t)
  184. if (tn==None):
  185. tn = self._first.TryGetValue(t)
  186. if (tn != None):
  187. if (tn.End):
  188. return True
  189. ptr = tn
  190. return False
  191. def Replace(self,text, replaceChar = '*'):
  192. result = list(text)
  193. ptr = None
  194. for i in range(len(text)): # for (i = 0; i < text.length; i++)
  195. t =ord(text[i]) # text.charCodeAt(index)
  196. tn = None
  197. if (ptr == None):
  198. tn = self._first.TryGetValue(t)
  199. else:
  200. tn = ptr.TryGetValue(t)
  201. if (tn==None):
  202. tn = self._first.TryGetValue(t)
  203. if (tn != None):
  204. if (tn.End):
  205. maxLength = len( self._keywords[tn.Results[0]])
  206. start = i + 1 - maxLength
  207. for j in range(start,i+1): # for (j = start; j <= i; j++)
  208. result[j] = replaceChar
  209. ptr = tn
  210. return ''.join(result)