def _strip_emphasis(segment: str) -> str:
    """Drop bold/italic delimiters from *segment*, keeping the emphasised text."""
    # A delimiter only counts when it hugs non-whitespace text and the run
    # stays on one line, so arithmetic such as "2 * 3 * 4" is left alone.
    # Underscore delimiters additionally must not touch a letter or digit on
    # the outside ("[^\W_]" = word character other than "_"), which keeps
    # snake_case identifiers intact.
    patterns = (
        r'\*\*(?=\S)([^*\n]+)(?<=\S)\*\*',
        r'(?<![^\W_])__(?=\S)([^_\n]+)(?<=\S)__(?![^\W_])',
        r'\*(?=\S)([^*\n]+)(?<=\S)\*',
        r'(?<![^\W_])_(?=\S)([^_\n]+)(?<=\S)_(?![^\W_])',
    )
    # Repeat until nothing changes so nested emphasis ("**a *b* c**") is fully
    # unwrapped; every substitution shortens the text, so this terminates.
    previous = None
    while previous != segment:
        previous = segment
        for pattern in patterns:
            segment = re.sub(pattern, r'\1', segment)
    return segment


def remove_markdown_symbol(text: str):
    """Convert Markdown-formatted text to plain text.

    Processing order matters and is chosen so that earlier steps cannot
    corrupt the input of later ones:

    1. fenced code blocks are removed together with their content;
    2. images and links are replaced by their target URL;
    3. line-level markers (block quotes, horizontal rules, headings, list
       bullets and numbers) are stripped;
    4. bold/italic delimiters are removed outside inline code, and inline
       code spans are unwrapped with their content left verbatim.

    Args:
        text: The Markdown text to clean.

    Returns:
        The cleaned text with surrounding whitespace stripped. Falsy or
        non-string input (``None``, ``""``, numbers, ...) is returned
        unchanged instead of raising.
    """
    if not text or not isinstance(text, str):
        return text

    # Fenced code blocks go first so that symbols inside them cannot pair up
    # with emphasis markers elsewhere. The lazy match also copes with single
    # backticks inside the block.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Images must be handled before links: the link pattern would otherwise
    # consume "[alt](url)" and leave a stray "!". Alt text may be empty.
    text = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', r'\2', text)
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\2', text)

    # Line-level markers. "[ \t]" is used instead of "\s" because "\s" also
    # matches newlines under re.MULTILINE and would swallow blank lines.
    text = re.sub(r'^[ \t]*(?:>[ \t]*)+', '', text, flags=re.MULTILINE)       # block quotes (nested too)
    text = re.sub(r'^[ \t]*[-*_]{3,}[ \t]*$', '', text, flags=re.MULTILINE)   # horizontal rules
    text = re.sub(r'^[ \t]*#+[ \t]*', '', text, flags=re.MULTILINE)           # headings, line start only
    text = re.sub(r'^[ \t]*[*+-][ \t]+', '', text, flags=re.MULTILINE)        # bullet lists
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)        # numbered lists

    # re.split with one capture group yields alternating pieces:
    # even indexes are ordinary text, odd indexes are inline-code contents.
    pieces = re.split(r'`([^`]+)`', text)
    text = ''.join(
        piece if index % 2 else _strip_emphasis(piece)
        for index, piece in enumerate(pieces)
    )

    return text.strip()