|
|
@@ -453,36 +453,81 @@ def check_chatroom(userName): |
|
|
|
|
|
|
|
# return text |
|
|
|
|
|
|
|
def remove_markdown_symbol(text: str) -> str:
    """Strip common Markdown markup from *text* and return plain text.

    Handled constructs:

    * fenced code blocks are dropped entirely (fences and content);
    * inline code keeps its content verbatim, only the backticks go;
    * ``![alt](url)`` and ``[label](url)`` with an http(s) target are
      reduced to the bare URL;
    * block-quote markers, heading markers, horizontal rules and
      bullet / numbered list markers are removed;
    * ``**bold**``, ``__bold__``, ``*italic*`` and ``_italic_`` keep only
      their inner text.

    URLs and inline-code contents are swapped for opaque tokens while the
    markup is stripped, so underscores or asterisks inside them are never
    mistaken for emphasis markers. They are restored unchanged at the end.

    Args:
        text: Markdown-formatted input.

    Returns:
        The input with the markup above removed and surrounding
        whitespace stripped. NUL characters in the input are dropped.
    """
    marker = "\x00"
    # NUL is reserved as the token delimiter; dropping any NUL already in the
    # input guarantees a token can never collide with user-supplied text.
    text = text.replace(marker, "")

    protected = []

    def protect(fragment):
        """Stash *fragment* and return the token that stands in for it."""
        protected.append(fragment)
        return f"{marker}{len(protected) - 1}{marker}"

    # Fenced code blocks go first: otherwise a URL written right before the
    # closing fence would swallow the fence and the block would survive.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Inline code: keep the content, but shield it from the emphasis rules.
    text = re.sub(r'`([^`]+)`', lambda m: protect(m.group(1)), text)

    # Images before links, because the link pattern also matches the
    # "[alt](url)" tail of an image and would leave a stray "!".
    text = re.sub(
        r'!\[[^\]]*?\]\((https?://[^\s)\x00]+)\)',
        lambda m: protect(m.group(1)),
        text,
    )
    text = re.sub(
        r'\[[^\]]*?\]\((https?://[^\s)\x00]+)\)',
        lambda m: protect(m.group(1)),
        text,
    )
    # Bare URLs. NUL is excluded so a URL never absorbs an adjacent token.
    text = re.sub(r'https?://[^\s)\x00]+', lambda m: protect(m.group(0)), text)

    # Line-level markers are removed before emphasis so that list bullets
    # and rules made of "*" / "_" are not consumed as emphasis delimiters.
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)  # block quotes
    text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE)  # headings
    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)  # rules
    text = re.sub(r'^\s*[\*\+\-]\s+', '', text, flags=re.MULTILINE)  # bullets
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)  # numbered

    # Bold first, then italic, so "**x**" is not half-eaten by the "*" rule.
    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'\*([^\*]+)\*', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)

    # Restore every protected fragment in a single pass, so restored content
    # is never rescanned for tokens.
    text = re.sub(
        r'\x00([0-9]+)\x00',
        lambda m: protected[int(m.group(1))],
        text,
    )

    return text.strip()
|
|
|
|
|
|
|
|
|
|
|