1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
import pandas as pd
import numpy as np
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from flask import Flask, request, jsonify
from flask_cors import CORS
import warnings

warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# 1. Text preprocessing and model training
# ---------------------------------------------------------------------------

# Minimal stop-word list. Kept as a module-level frozenset so it is built once
# and membership tests are O(1) instead of rebuilding a list on every call.
_STOP_WORDS = frozenset([
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
    '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着',
    '没有', '看', '好', '自己', '这',
])

# Pre-compiled patterns used by preprocess_text.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')

# Artefact locations shared by loading and saving.
_MODEL_PATH = 'sentiment_model.pkl'
_VECTORIZER_PATH = 'tfidf_vectorizer.pkl'
_DATASET_PATH = 'data.csv'


def preprocess_text(text):
    """Normalise a raw review into a space-separated token string.

    Steps: strip punctuation/special characters, strip digits, trim
    whitespace, segment with jieba, then drop stop words and every token of
    length 1.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The surviving tokens joined by single spaces; ``""`` when the input
        is not a string or nothing survives filtering.
    """
    if not isinstance(text, str):
        return ""

    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    text = text.strip()

    tokens = jieba.lcut(text)
    kept = [tok for tok in tokens if tok not in _STOP_WORDS and len(tok) > 1]
    return ' '.join(kept)


def train_model():
    """Train the sentiment classifier from ``data.csv``.

    The CSV must provide a ``review`` column (text) and a ``label`` column.
    Reviews are preprocessed, split 80/20 (stratified on the label),
    vectorised with TF-IDF and fitted with logistic regression.

    Returns:
        ``(model, vectorizer, accuracy)`` on success, where ``accuracy`` is
        measured on the held-out split; ``(None, None, None)`` when the
        dataset cannot be loaded or lacks a required column.
    """
    print("开始加载数据集...")
    try:
        df = pd.read_csv(_DATASET_PATH)
        # Validate the schema here so a missing column is reported through the
        # same failure path instead of raising KeyError later on.
        missing = [col for col in ('review', 'label') if col not in df.columns]
        if missing:
            raise ValueError(f"数据集缺少必要的列: {missing}")
        print(f"数据集加载成功,共{len(df)}条数据")
        print(f"数据列名: {df.columns.tolist()}")
        print(f"标签分布:\n{df['label'].value_counts()}")
    except Exception as e:
        print(f"加载数据集失败: {e}")
        return None, None, None

    print("开始数据预处理...")
    df['cleaned_review'] = df['review'].apply(preprocess_text)
    print(f"预处理完成,示例数据:\n{df[['review', 'cleaned_review', 'label']].head()}")

    # Hold out 20% for evaluation, keeping the label distribution.
    X = df['cleaned_review']
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集大小: {len(X_train)}, 测试集大小: {len(X_test)}")

    # TF-IDF over unigrams and bigrams; fitted on the training split only.
    print("开始特征提取...")
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9
    )
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    print(f"特征维度: {X_train_tfidf.shape}")

    print("开始训练模型...")
    model = LogisticRegression(
        C=1.0,
        max_iter=1000,
        random_state=42,
        class_weight='balanced'
    )
    model.fit(X_train_tfidf, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型在测试集上的准确率: {accuracy:.4f}")

    return model, vectorizer, accuracy


# ---------------------------------------------------------------------------
# 2. Flask API service
# ---------------------------------------------------------------------------
app = Flask(__name__)
CORS(app)

# Populated by load_or_train_model(); both stay None until that succeeds.
model = None
vectorizer = None


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the sentiment of one review.

    Expects a JSON object body of the form ``{"text": "<review>"}``.

    Returns:
        200 with ``{"result": <int label>, "probability": <float>}``, where
        ``probability`` is the model's probability for the predicted label;
        400 when the body is not a JSON object with a non-empty string
        ``text``; 503 when no model is loaded; 500 on unexpected errors.
    """
    # When the module is imported by a WSGI server the __main__ guard never
    # runs, so the model may be absent; report that the same way /health does.
    if model is None or vectorizer is None:
        return jsonify({'error': '模型未加载'}), 503

    try:
        # silent=True yields None for malformed / non-JSON bodies instead of
        # raising, so those requests get the 400 below rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'text' not in data:
            return jsonify({'error': '请求格式错误,需要{"text": "评论内容"}'}), 400

        text = data['text']
        if not text or not isinstance(text, str):
            return jsonify({'error': 'text字段不能为空且必须是字符串'}), 400

        cleaned_text = preprocess_text(text)
        if not cleaned_text:
            # Nothing left to classify after preprocessing: fall back to the
            # default answer (positive, 0.5).
            return jsonify({
                'result': 1,
                'probability': 0.5
            }), 200

        text_tfidf = vectorizer.transform([cleaned_text])

        prediction = model.predict(text_tfidf)[0]
        probabilities = model.predict_proba(text_tfidf)[0]
        # predict_proba columns follow model.classes_, not the label values,
        # so locate the predicted label's column instead of indexing by label.
        class_index = int(np.flatnonzero(model.classes_ == prediction)[0])
        pred_prob = probabilities[class_index]

        return jsonify({
            'result': int(prediction),
            'probability': float(pred_prob)
        }), 200

    except Exception as e:
        print(f"预测过程中出错: {e}")
        return jsonify({'error': '内部服务器错误'}), 500


@app.route('/health', methods=['GET'])
def health_check():
    """Report whether both the model and the vectorizer are loaded.

    Returns:
        200 ``{"status": "healthy", "model_loaded": true}`` when both are
        present, otherwise 503 ``{"status": "unhealthy", "model_loaded": false}``.
    """
    if model is not None and vectorizer is not None:
        return jsonify({'status': 'healthy', 'model_loaded': True}), 200
    return jsonify({'status': 'unhealthy', 'model_loaded': False}), 503


def load_or_train_model():
    """Populate the global ``model`` / ``vectorizer``.

    Tries the saved artefacts first; if they cannot be loaded, trains a new
    model and saves it.

    Returns:
        True when a model and vectorizer are available afterwards, False when
        loading failed and training also failed.
    """
    global model, vectorizer

    try:
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load artefacts produced by this service.
        loaded_model = joblib.load(_MODEL_PATH)
        loaded_vectorizer = joblib.load(_VECTORIZER_PATH)
    except Exception as e:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate, and log the reason.
        print(f"模型加载失败原因: {e}")
        print("未找到已保存的模型,开始训练新模型...")
    else:
        # Assign only after both artefacts loaded, so the globals are never
        # left half-initialised.
        model, vectorizer = loaded_model, loaded_vectorizer
        print("模型加载成功")
        return True

    model, vectorizer, accuracy = train_model()
    if model is None:
        print("模型训练失败")
        return False

    joblib.dump(model, _MODEL_PATH)
    joblib.dump(vectorizer, _VECTORIZER_PATH)
    print(f"模型训练完成并已保存,准确率: {accuracy:.4f}")
    return True


if __name__ == '__main__':
    print("=" * 50)
    print("医疗情感分析API服务启动中...")
    print("=" * 50)

    # Only start serving once a model is available.
    if load_or_train_model():
        print("模型准备就绪")
        app.run(host='0.0.0.0', port=9000, debug=False)
    else:
        print("模型加载/训练失败,服务无法启动")
|