From b5cf40acf164fc65d700e14fe59dc09d2b28c8eb Mon Sep 17 00:00:00 2001
From: H Vs <vson.iwork@outlook.com>
Date: Wed, 9 Apr 2025 16:47:12 +0800
Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8E=BBmd?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 common/utils.py | 79 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 17 deletions(-)

diff --git a/common/utils.py b/common/utils.py
index e6a033f..e421c97 100644
--- a/common/utils.py
+++ b/common/utils.py
@@ -453,36 +453,81 @@ def check_chatroom(userName):
     
 #     return text
 
-def remove_markdown_symbol(text: str):
-  # 去除标题
-    text = re.sub(r'#+\s*', '', text)
+# def remove_markdown_symbol(text: str):
+#   # 去除标题
+#     text = re.sub(r'#+\s*', '', text)
     
-    # 去除粗体和斜体
-    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
-    text = re.sub(r'__([^_]+)__', r'\1', text)
-    text = re.sub(r'\*([^*]+)\*', r'\1', text)
-    text = re.sub(r'_([^_]+)_', r'\1', text)
+#     # 去除粗体和斜体
+#     text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
+#     text = re.sub(r'__([^_]+)__', r'\1', text)
+#     text = re.sub(r'\*([^*]+)\*', r'\1', text)
+#     text = re.sub(r'_([^_]+)_', r'\1', text)
+    
+#     # 保留链接地址
+#     text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
+    
+#     # 保留图片地址
+#     text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
+    
+#     # 去除列表
+#     text = re.sub(r'^\s*[\*\+\-]\s+', '', text, flags=re.MULTILINE)
+#     text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
+    
+#     # 去除代码块和内联代码
+#     text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
+#     text = re.sub(r'`([^`]+)`', r'\1', text)
     
-    # 保留链接地址
-    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
+#     # 去除引用
+#     text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
     
-    # 保留图片地址
-    text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
+#     # 去除水平线
+#     text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
     
+#     return text.strip()
+
+def remove_markdown_symbol(text: str) -> str:
+    """Strip common Markdown markup from ``text``, leaving http(s) URLs intact.
+
+    URLs are parked as ``[[URLn]]`` placeholders while markup is stripped, then restored.
+    Line-anchored markers are removed before emphasis so ``* item`` is not read as italics.
+    """
+    url_placeholders = []
+
+    def url_replacer(url):
+        url_placeholders.append(url)
+        return f"[[URL{len(url_placeholders)-1}]]"
+
+    # Park URLs in order: images ![alt](url), links [text](url), then bare links.
+    text = re.sub(r'!\[[^\]]*?\]\((https?://[^\s)]+)\)', lambda m: url_replacer(m.group(1)), text)
+    text = re.sub(r'\[[^\]]*?\]\((https?://[^\s)]+)\)', lambda m: url_replacer(m.group(1)), text)
+    text = re.sub(r'https?://[^\s)]+', lambda m: url_replacer(m.group(0)), text)
+
+    # Strip heading markers at the start of a line.
+    text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE)
+
     # 去除列表
     text = re.sub(r'^\s*[\*\+\-]\s+', '', text, flags=re.MULTILINE)
     text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
-    
+
     # 去除代码块和内联代码
-    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
+    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
     text = re.sub(r'`([^`]+)`', r'\1', text)
-    
+
     # 去除引用
     text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
-    
+
     # 去除水平线
     text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
-    
+
+    # Bold/italic run last: before list stripping, "* a\n* b" was matched as *italic*.
+    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
+    text = re.sub(r'__([^_]+)__', r'\1', text)
+    text = re.sub(r'\*([^\*]+)\*', r'\1', text)
+    text = re.sub(r'_([^_]+)_', r'\1', text)
+
+    # Restore the parked URLs.
+    for i, url in enumerate(url_placeholders):
+        text = text.replace(f"[[URL{i}]]", url)
     return text.strip()