2026数信杯AI决赛wp

ljnljn Lv6

最后排名第30(学生队第二),学生队一共也只有19队,主要还是公司居多
本来感觉没啥希望了,结果发现大模型越狱那道题接的是云端API模型,后面做起来就顺利一些了

模型生命周期1

写爬虫爬取数据
本人因为忘记爬虫怎么写所以用的切片:(

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import requests

from hashlib import md5

# One row per scraped patient page: [id, name, tel, review, image MD5].
submit_data=[]

def download_image(url):
    """Download an image and return the MD5 hex digest of its bytes.

    The request is sent with the module-level ``cookies`` dict. On HTTP 200
    the body is also saved to ``temp.png`` (overwritten on every call).

    Args:
        url: Absolute URL of the image.

    Returns:
        The hex MD5 digest of the response body, or ``None`` when the server
        answers with any status other than 200.
    """
    response = requests.get(url, cookies=cookies)
    if response.status_code != 200:
        return None

    content = response.content
    with open("temp.png", "wb") as f:
        f.write(content)

    # Hash the bytes already in memory instead of reopening the file: the
    # digest is the same and no file handle is left unclosed.
    return md5(content).hexdigest()

# The session cookie is identical for every request, so build it once.
# download_image() reads this module-level name as well.
cookies={"session":"eyJsb2dnZWRfaW4iOnRydWUsInNlY3VyaXR5X3NlZWQiOiJuODdSMnlTbmRydHBaNnp1IiwidXNlcm5hbWUiOiJhZG1pbiJ9.abzkAQ.h11w8NWOL-sdaNF4uQOnwIl3mlY"}

for i in range(1,501):
    website="http://192.168.69.201:19301/patient/"+str(i)
    print(website)

    response = requests.get(website,cookies=cookies)
    print(response.status_code)
    print(response.text)

    s=response.text
    print(s.find("来自体检记录"))

    # Fields are cut out with fixed offsets from each label's position.
    # NOTE(review): str.find returns -1 when a label is missing, which would
    # silently yield a garbage slice; the offsets assume the page layout
    # never changes.
    id_pos = s.find("患者ID")
    name_pos = s.find("姓名")
    tel_pos = s.find("电话")
    review_pos = s.find("来自体检记录")

    patient_id = s[id_pos+50:id_pos+53]
    name = s[name_pos+48:name_pos+51]
    tel = s[tel_pos+48:tel_pos+59]
    review = s[review_pos+138:review_pos+238]

    pic = s[s.find("<img src=\"")+10:s.find(".png")+4]
    pic = "http://192.168.69.201:19301"+pic
    print(pic)

    # Download once and reuse the digest; the original fetched every image
    # twice (once for the debug print, once for the data row).
    image_md5 = download_image(pic)
    print(image_md5)

    # The 3-character slice overruns into the closing tag for 1- and 2-digit
    # ids; trim the captured markup.
    if "</" in patient_id:
        patient_id = patient_id[0]
    elif "<" in patient_id:
        patient_id = patient_id[:2]
        print(patient_id)

    # The 100-character window may run past the end of the review element.
    if "</div>" in review:
        review = review[:review.find("</div>")]

    data = [patient_id, name, tel, review, image_md5]
    print(data)
    submit_data.append(data)

import csv

# Write the collected rows with the csv module so that a comma, quote or
# newline inside a review is quoted instead of corrupting the row.
# newline="" hands line-ending control to the csv writer, and
# lineterminator="\n" keeps the line endings the hand-written version emitted.
# UTF-8 is set explicitly because the rows contain Chinese text.
# Note: csv.writer writes None as an empty field (a failed image download
# previously produced the literal text "None").
with open("submit1.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["patient_id", "patient_name", "phone", "comment", "ct_image_md5"])
    writer.writerows(submit_data)

模型生命周期2

用1的数据做人工标注就行,500条数据看哪个是正向哪个负面情绪,人工应该是最方便的

模型生命周期3

这里是用的大模型越狱那个ai
直接发到服务器就行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import pandas as pd  
import numpy as np
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from flask import Flask, request, jsonify
from flask_cors import CORS
import warnings
warnings.filterwarnings('ignore')

# 1. 数据预处理和模型训练函数
# Stop words removed after segmentation (simple built-in list). Built once as
# a frozenset so membership tests are O(1) and the list is not rebuilt on
# every call.
_STOP_WORDS = frozenset([
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
    '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着',
    '没有', '看', '好', '自己', '这',
])


def preprocess_text(text):
    """Clean a piece of text and return its tokens joined by single spaces.

    Steps: drop every character that is neither a word character nor
    whitespace, drop digits, strip surrounding whitespace, segment with
    ``jieba.lcut``, then discard stop words and single-character tokens.

    Args:
        text: The raw text. Any non-``str`` value is treated as empty.

    Returns:
        The space-joined tokens, or ``""`` when the input is not a string or
        nothing remains after cleaning.
    """
    if not isinstance(text, str):
        return ""
    # Remove special characters and punctuation.
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits.
    text = re.sub(r'\d+', '', text)
    # Remove surrounding whitespace.
    text = text.strip()
    if not text:
        # Nothing left to segment; the joined result would be empty anyway.
        return ""
    # Segment with jieba.
    words = jieba.lcut(text)
    # Filter stop words and single-character tokens.
    return ' '.join(
        word for word in words if len(word) > 1 and word not in _STOP_WORDS
    )

def train_model():
    """Train the sentiment classifier from ``data.csv``.

    Reads ``data.csv`` (columns ``review`` and ``label`` are used), cleans
    the reviews with ``preprocess_text``, makes a stratified 80/20 split,
    fits a TF-IDF vectorizer and a logistic-regression model on the training
    part, and prints the accuracy on the held-out part.

    Returns:
        ``(model, vectorizer, accuracy)`` on success, or
        ``(None, None, None)`` when loading or inspecting the dataset fails.
    """
    print("开始加载数据集...")
    try:
        # Load the dataset.
        df = pd.read_csv('data.csv')
        print(f"数据集加载成功,共{len(df)}条数据")
        print(f"数据列名: {df.columns.tolist()}")
        print(f"标签分布:\n{df['label'].value_counts()}")
    except Exception as e:
        print(f"加载数据集失败: {e}")
        return None, None, None

    # Preprocess the text.
    print("开始数据预处理...")
    df['cleaned_review'] = df['review'].apply(preprocess_text)
    print(f"预处理完成,示例数据:\n{df[['review', 'cleaned_review', 'label']].head()}")

    # Split into training and test sets, keeping the label ratio.
    X = df['cleaned_review']
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集大小: {len(X_train)}, 测试集大小: {len(X_test)}")

    # Feature extraction: TF-IDF over unigrams and bigrams.
    print("开始特征提取...")
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9
    )
    # Fit on the training split only; the test split is just transformed.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    print(f"特征维度: {X_train_tfidf.shape}")

    # Train the model.
    print("开始训练模型...")
    model = LogisticRegression(
        C=1.0,
        max_iter=1000,
        random_state=42,
        class_weight='balanced'
    )
    model.fit(X_train_tfidf, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型在测试集上的准确率: {accuracy:.4f}")

    return model, vectorizer, accuracy

# 2. Flask API service
app = Flask(__name__)
CORS(app)

# Module-level holders for the model and the vectorizer; both start as None
# and are assigned by load_or_train_model().
model = None
vectorizer = None

@app.route('/predict', methods=['POST'])
def predict():
    """Prediction endpoint: classify the ``text`` field of a JSON request.

    Expects a JSON object of the form ``{"text": "..."}``.

    Returns:
        200 with ``{"result": <int label>, "probability": <float>}`` on
        success (``result`` 1 and ``probability`` 0.5 when the text is empty
        after preprocessing), 400 with an ``error`` message for a malformed
        request, or 500 with an ``error`` message on any unexpected failure.
    """
    try:
        # silent=True makes a missing or malformed JSON body come back as
        # None, so the client gets the 400 below instead of the exception
        # being turned into a 500 by the broad handler at the bottom.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'text' not in data:
            return jsonify({'error': '请求格式错误,需要{"text": "评论内容"}'}), 400

        text = data['text']
        if not text or not isinstance(text, str):
            return jsonify({'error': 'text字段不能为空且必须是字符串'}), 400

        # Preprocess the text.
        cleaned_text = preprocess_text(text)
        if not cleaned_text:
            # Nothing left after preprocessing: return the default answer.
            return jsonify({
                'result': 1,  # default: positive
                'probability': 0.5
            }), 200

        # Feature extraction.
        text_tfidf = vectorizer.transform([cleaned_text])
        # Prediction.
        prediction = model.predict(text_tfidf)[0]
        probabilities = model.predict_proba(text_tfidf)[0]
        # predict() picks the most probable class, so the probability of the
        # predicted class is the row maximum. Indexing the row by the label
        # value is only correct when the labels are exactly 0..n-1.
        pred_prob = probabilities.max()

        return jsonify({
            'result': int(prediction),
            'probability': float(pred_prob)
        }), 200

    except Exception as e:
        print(f"预测过程中出错: {e}")
        return jsonify({'error': '内部服务器错误'}), 500

@app.route('/health', methods=['GET'])
def health_check():
    """Health-check endpoint.

    Returns:
        200 with ``model_loaded: True`` when both the model and the
        vectorizer are set, otherwise 503 with ``model_loaded: False``.
    """
    if model is not None and vectorizer is not None:
        return jsonify({'status': 'healthy', 'model_loaded': True}), 200
    else:
        return jsonify({'status': 'unhealthy', 'model_loaded': False}), 503

def load_or_train_model():
    """Load the saved model and vectorizer, training new ones if that fails.

    Sets the module-level ``model`` and ``vectorizer``. When the pickles
    ``sentiment_model.pkl`` / ``tfidf_vectorizer.pkl`` cannot be loaded, a
    new model is trained with ``train_model()`` and saved under those names.

    Returns:
        ``True`` when a model is ready, ``False`` when training failed.
    """
    global model, vectorizer
    try:
        # Try to load a previously saved model.
        model = joblib.load('sentiment_model.pkl')
        vectorizer = joblib.load('tfidf_vectorizer.pkl')
        print("模型加载成功")
        return True
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C and SystemExit
        # are not mistaken for "no saved model".
        print("未找到已保存的模型,开始训练新模型...")
        model, vectorizer, accuracy = train_model()
        if model is not None:
            # Save the freshly trained model.
            joblib.dump(model, 'sentiment_model.pkl')
            joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
            print(f"模型训练完成并已保存,准确率: {accuracy:.4f}")
            return True
        else:
            print("模型训练失败")
            return False

if __name__ == '__main__':
    print("="*50)
    print("医疗情感分析API服务启动中...")
    print("="*50)

    # Load or train the model; only start the server when one is available.
    if load_or_train_model():
        print("模型准备就绪")
        app.run(host='0.0.0.0', port=9000, debug=False)
    else:
        print("模型加载/训练失败,服务无法启动")

模型训练安全2

自动访问api,比较最相似图像

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import requests

import numpy as np

import os

from tqdm import tqdm

import hashlib



# Base configuration

base_url = "http://192.168.69.201:19304"

upload_url = f"{base_url}/api/upload"

poison_image_path = "image.png" # poisoned sample image



def get_embedding(image_path):
    """Upload an image to the API and return its embedding vector.

    The file is posted to ``upload_url`` as the multipart field ``file``.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        A NumPy array built from the ``embedding`` field of the JSON reply,
        or ``None`` when the status is not 200, the reply has no truthy
        ``success`` field, or any exception is raised (a message is printed
        in each of those cases).
    """
    try:
        with open(image_path, 'rb') as image_file:
            upload = {'file': (os.path.basename(image_path), image_file, 'image/png')}
            reply = requests.post(upload_url, files=upload)

            if reply.status_code != 200:
                print(f" 请求失败,状态码: {reply.status_code}")
                return None

            payload = reply.json()
            if not payload.get('success'):
                print(f" API返回失败: {payload}")
                return None

            return np.array(payload['embedding'])
    except Exception as err:
        print(f" 获取嵌入向量时出错: {err}")
        return None



def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector, or ``None``.
        vec2: Second vector, or ``None``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0`` when either
        argument is ``None`` or either vector has zero norm.
    """
    if vec1 is None or vec2 is None:
        return 0

    inner = np.dot(vec1, vec2)
    length_a = np.linalg.norm(vec1)
    length_b = np.linalg.norm(vec2)

    if length_a != 0 and length_b != 0:
        return inner / (length_a * length_b)
    return 0



def find_most_similar_image():
    """Find the local image whose embedding is closest to the poison sample.

    Embeds ``poison_image_path``, then every existing ``temp{i}.png`` for
    ``i`` in 1..500, and keeps the candidate with the highest cosine
    similarity to the poison embedding.

    Returns:
        ``None`` when the poison sample cannot be embedded; otherwise a
        tuple ``(id, filename, similarity)``, which is ``(None, None, -1)``
        when no candidate scored above -1.
    """
    print(" 正在获取投毒样本的嵌入向量...")
    poison_embedding = get_embedding(poison_image_path)
    if poison_embedding is None:
        print(" 无法获取投毒样本的嵌入向量")
        return None

    print(f" 投毒样本嵌入向量维度: {len(poison_embedding)}")
    print(" 开始处理500张医疗影像...")

    best_id, best_filename, best_score = None, None, -1

    # Walk the 500 candidate images.
    for i in tqdm(range(1, 501)):
        candidate = f"temp{i}.png"
        if not os.path.exists(candidate):
            print(f" 文件不存在: {candidate}")
            continue

        embedding = get_embedding(candidate)
        if embedding is None:
            continue

        score = cosine_similarity(poison_embedding, embedding)
        if score > best_score:
            best_id, best_filename, best_score = i, candidate, score
            print(f" 发现更高相似度: ID={i}, 相似度={score:.4f}")

    return best_id, best_filename, best_score



def get_patient_ct_image(patient_id):
    """Return the CT image file name used for a patient.

    The task description gives ``1_ca7ca8f0-dc93-42dc-9d02-ede0c07732fa.png``
    as the file name for patient ID 1. A real environment would have to look
    the name up through an API; here the given name is used directly, with
    ``patient_id`` as the prefix.

    Args:
        patient_id: The patient identifier placed in front of the fixed suffix.

    Returns:
        ``"<patient_id>_ca7ca8f0-dc93-42dc-9d02-ede0c07732fa.png"``.
    """
    return "{}_ca7ca8f0-dc93-42dc-9d02-ede0c07732fa.png".format(patient_id)



def calculate_md5(image_path):
    """Return the MD5 hex digest of a file, read in 8192-byte chunks.

    Args:
        image_path: Path of the file to hash.

    Returns:
        The hex digest string, or ``None`` (after printing a message) when
        any exception is raised while opening or reading the file.
    """
    try:
        with open(image_path, 'rb') as stream:
            digest = hashlib.md5()
            # iter() with a sentinel keeps calling read() until it returns b''.
            for block in iter(lambda: stream.read(8192), b''):
                digest.update(block)
            return digest.hexdigest()
    except Exception as err:
        print(f" 计算MD5时出错: {err}")
        return None



def main():
    """Run the similarity search and print the best match.

    Prints a "not found" message when ``find_most_similar_image`` returns
    ``None`` (poison sample could not be embedded) or a result whose ID is
    ``None`` (no candidate image could be scored).
    """
    result = find_most_similar_image()
    if result is None:
        print(" 未找到相似图像")
        return

    similar_id, similar_filename, similarity_score = result
    if similar_id is None:
        # The search ran but no candidate was scored; do not report the
        # placeholder (None, None, -1) as if it were a match.
        print(" 未找到相似图像")
        return

    # The heading starts with a newline; the original literal had lost the
    # backslash and printed a stray "n".
    print("\n最相似图像信息:")
    print(f" ID: {similar_id}")
    print(f" 文件名: {similar_filename}")
    print(f" 余弦相似度: {similarity_score:.6f}")


if __name__ == "__main__":
    main()

对抗性推理1

assets/2026数信杯AI决赛wp/file-20260320163608745.png
assets/2026数信杯AI决赛wp/file-20260320163612709.png
assets/2026数信杯AI决赛wp/file-20260320163617383.png
assets/2026数信杯AI决赛wp/file-20260320163621327.png
手动修改

对抗性推理3

这里只做到第一步,第二步那个实在不会
assets/2026数信杯AI决赛wp/file-20260320160021151.png
assets/2026数信杯AI决赛wp/file-20260320160029754.png

模型隐私攻击1

这个只拿了7分,实在没时间做了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import requests

import pandas as pd

import numpy as np

import json

import time

from tqdm import tqdm



class MemberInferenceAttack:
    """Confidence-based membership inference against an HTTP prediction API.

    Each candidate record is sent to the API several times; records answered
    with high, stable confidence are ranked as more likely to have been part
    of the training set.
    """

    def __init__(self, api_url="http://192.168.69.202:19305/predict"):
        """Store the endpoint and build the request headers.

        Args:
            api_url: Full URL of the prediction endpoint. Host, Origin and
                Referer are derived from it so they always match the target.
        """
        from urllib.parse import urlsplit

        self.api_url = api_url
        target = urlsplit(api_url)
        origin = f"{target.scheme}://{target.netloc}"
        self.headers = {
            "Host": target.netloc,
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Content-Type": "application/json",
            "Accept": "*/*",
            "Origin": origin,
            "Referer": origin + "/",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive"
        }

    @staticmethod
    def _extract_probabilities(result):
        """Return the confidence values of one API reply as a list, or None.

        The reply format has to be adapted to the real API; the keys
        ``probabilities``, ``score`` and ``prediction`` are tried in that order.
        """
        if "probabilities" in result:
            return result["probabilities"]
        if "score" in result:
            return [result["score"]]
        if "prediction" in result:
            return [result["prediction"]]
        return None

    @staticmethod
    def _summarize(probabilities):
        """Reduce the collected per-query rows to summary metrics, or None if empty."""
        if not probabilities:
            return None

        probs_array = np.array(probabilities)
        if probs_array.ndim == 1:
            probs_array = probs_array.reshape(-1, 1)

        # Entropy only makes sense for a distribution (more than one column).
        if probs_array.shape[1] > 1:
            # 1e-10 avoids log(0) for zero probabilities.
            entropy = -np.sum(probs_array * np.log(probs_array + 1e-10), axis=1).mean()
        else:
            entropy = 0

        return {
            "mean_confidence": float(np.mean(probs_array)),
            "std_confidence": float(np.std(probs_array)),
            "max_confidence": float(np.max(probs_array)),
            "entropy": float(entropy),
            "raw_probabilities": probabilities
        }

    def query_model(self, text, num_queries=5):
        """Query the model several times and measure confidence fluctuation.

        Args:
            text: Value to send; it is converted with ``str``.
            num_queries: Number of requests to send.

        Returns:
            A dict with ``mean_confidence``, ``std_confidence``,
            ``max_confidence``, ``entropy`` and ``raw_probabilities``, or
            ``None`` when no query produced a usable reply.
        """
        probabilities = []
        for _ in range(num_queries):
            try:
                response = requests.post(
                    self.api_url,
                    headers=self.headers,
                    json={"text": str(text)},
                    timeout=10,
                )
                if response.status_code == 200:
                    row = self._extract_probabilities(response.json())
                    if row is not None:
                        probabilities.append(row)
                else:
                    # Failed queries are skipped: recording them as [0] would
                    # pull the mean down, inflate the std, and make the array
                    # ragged next to multi-class rows.
                    print(f"请求失败: {response.status_code}")
            except Exception as e:
                print(f"查询异常: {e}")
            time.sleep(0.1)  # avoid sending requests too quickly

        return self._summarize(probabilities)

    def analyze_patients(self, patient_data_path, output_path="submit.csv"):
        """Score every patient record and write the submission files.

        Reads the patient table (Excel first, CSV as fallback) using the
        columns ``id``, ``患者姓名`` and ``手机号``, queries the model with each
        phone number, ranks the records by member score, writes the top 200
        phone numbers to ``output_path`` and the full ranking to
        ``detailed_results.csv``.

        Args:
            patient_data_path: Path of the patient data file.
            output_path: Path of the submission CSV.

        Returns:
            ``(submit_df, results_sorted)``: the submission frame and the
            full ranking, highest member score first.
        """
        try:
            df = pd.read_excel(patient_data_path)
        except Exception:
            # Not readable as Excel: assume the file is CSV.
            df = pd.read_csv(patient_data_path)
        print(f"共读取{len(df)}条患者数据")

        results = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            phone = row["手机号"]
            record = {
                "id": row["id"],
                "phone": phone,
                "name": row["患者姓名"],
                "member_score": 0,
                "mean_confidence": 0,
                "std_confidence": 0,
                "entropy": 0
            }

            # The phone number is the query text (per the task description).
            metrics = self.query_model(str(phone), num_queries=3)
            if metrics:
                # Training members usually show higher confidence and lower
                # fluctuation.
                member_score = metrics["mean_confidence"] * (1 - metrics["std_confidence"])
                if metrics["entropy"] > 0:
                    member_score = member_score * (1 - metrics["entropy"])
                record.update(
                    member_score=member_score,
                    mean_confidence=metrics["mean_confidence"],
                    std_confidence=metrics["std_confidence"],
                    entropy=metrics["entropy"],
                )
            results.append(record)

        # Explicit columns keep the sort below working for an empty input.
        results_df = pd.DataFrame(results, columns=[
            "id", "phone", "name", "member_score",
            "mean_confidence", "std_confidence", "entropy",
        ])
        results_sorted = results_df.sort_values("member_score", ascending=False)

        # The first 200 records are taken as inferred training-set members.
        top_200 = results_sorted.head(200)

        # Size the id column from the rows actually selected: a fixed
        # range(1, 201) raises when fewer than 200 records are available.
        submit_df = pd.DataFrame({
            "id": range(1, len(top_200) + 1),
            "phone": top_200["phone"].values
        })
        submit_df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"提交文件已生成: {output_path}")
        print(f"选择的手机号数量: {len(submit_df)}")

        # Save the detailed results.
        results_sorted.to_csv("detailed_results.csv", index=False, encoding="utf-8")

        return submit_df, results_sorted



# Main program

if __name__ == "__main__":
    # Build the attacker and run the attack.
    attacker = MemberInferenceAttack()
    submit_df, detailed_results = attacker.analyze_patients(
        patient_data_path="患者信息.xlsx",  # change to the actual file path
        output_path="submit.csv",
    )

    print("\n前10个推断为训练集的样本:")
    print(submit_df.head(10))

    # Summary of the member scores over all records.
    scores = detailed_results['member_score']
    print("\n成员得分统计:")
    for label, value in (
        ("最高分", scores.max()),
        ("最低分", scores.min()),
        ("平均分", scores.mean()),
    ):
        print(f"{label}: {value:.4f}")
  • 标题: 2026数信杯AI决赛wp
  • 作者: ljnljn
  • 创建于 : 2026-03-20 19:00:00
  • 更新于 : 2026-05-25 22:04:46
  • 链接: https://ljnljn2005.github.io/2026/03/20/2026数信杯AI决赛wp/
  • 版权声明: 本文章采用 CC BY-NC-SA 4.0 进行许可。