# Reconstructed from patch b5cf40a against common/utils.py
# (subject, decoded: "调整去md", i.e. adjust Markdown removal).
import re


def remove_markdown_symbol(text: str) -> str:
    """Strip common Markdown markup from *text*, keeping URLs intact.

    Processing order matters and is deliberate:

    1. Every ``http(s)`` URL (from image syntax, link syntax, or bare) is
       swapped for an opaque placeholder so later passes cannot damage the
       ``_``/``*``/``#`` characters inside it. For images and links only the
       URL is kept; the alt/link text is discarded.
    2. Fenced code blocks are removed entirely (content included).
    3. Line-level markers (horizontal rules, block quotes, headings, list
       bullets and numbers) are stripped before emphasis, so a ``*`` bullet
       is never mistaken for an italic delimiter.
    4. Bold/italic delimiters and inline-code backticks are removed, keeping
       the inner text.
    5. Placeholders are replaced by the original URLs.

    Args:
        text: Markdown-formatted text. A falsy value yields ``""``.

    Returns:
        The text without Markdown markup, with leading and trailing
        whitespace stripped.
    """
    if not text:
        return ""

    stashed_urls = []

    def _stash(match):
        # Link/image patterns capture the URL in group 1; the bare-URL
        # pattern has no group, so fall back to the whole match.
        stashed_urls.append(match.group(match.lastindex or 0))
        # NUL-delimited index: cannot collide with Markdown syntax or with
        # ordinary text such as a literal "[[URL0]]".
        return f"\x00{len(stashed_urls) - 1}\x00"

    # Images first, so "![alt](url)" is not half-consumed by the link pattern.
    text = re.sub(r'!\[[^\]]*?\]\((https?://[^\s)]+)\)', _stash, text)
    text = re.sub(r'\[[^\]]*?\]\((https?://[^\s)]+)\)', _stash, text)
    text = re.sub(r'https?://[^\s)]+', _stash, text)

    # Fenced code blocks are dropped together with their content.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Line-level markers. "[ \t]*" rather than "\s*" so that a match never
    # swallows the newlines of neighbouring blank lines.
    text = re.sub(r'^[ \t]*[-*_]{3,}[ \t]*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^(?:>[ \t]*)+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*#+[ \t]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*[*+\-][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Emphasis is confined to a single line. Underscore emphasis must not be
    # glued to ASCII letters/digits, which keeps snake_case identifiers and
    # file names such as "file_a.txt" untouched.
    text = re.sub(r'\*\*([^*\n]+)\*\*', r'\1', text)
    text = re.sub(r'(?<![A-Za-z0-9])__([^_\n]+)__(?![A-Za-z0-9])', r'\1', text)
    text = re.sub(r'\*([^*\n]+)\*', r'\1', text)
    text = re.sub(r'(?<![A-Za-z0-9])_([^_\n]+)_(?![A-Za-z0-9])', r'\1', text)

    # Inline code: keep the content, drop the backticks.
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Put the URLs back in a single pass.
    text = re.sub(
        r'\x00(\d+)\x00',
        lambda match: stashed_urls[int(match.group(1))],
        text,
    )

    return text.strip()