# CulturalValues/Выявление культурных ценностей в отзывах.py

#!/usr/bin/env python
# coding: utf-8

# ### Загрузка исходной базы данных

# In[1]:


import pandas as pd

# Load only the review-text column (index 10) of the semicolon-separated
# source file. With header=None the first line of the file is read as an
# ordinary data row.
df = pd.read_csv(
    'База_данных_отзывы.csv',
    delimiter=';',
    header=None,
    usecols=[10],
)
df.columns = ['Отзывы']

# Rows without review text are dropped before the column becomes a list.
documents = df['Отзывы'].dropna().tolist()
print(documents)


# In[2]:


# Discard the first element (presumably the header cell that was read as
# data) and show the first remaining review.
del documents[0]
print(documents[0])


# ### Предобработка данных

# In[3]:


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used below: the stop-word lists and the "punkt"
# tokenizer data.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Lookup sets for token filtering: Russian stop words and the ASCII
# punctuation characters from string.punctuation.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('russian'))

# Collects the cleaned text of every review.
preprocessed_text = []

def preprocess_text(text):
    """Lowercase *text*, tokenize it, and drop stop words and punctuation.

    Tokens come from ``word_tokenize``. A token is discarded when it is a
    member of the module-level ``stop_words`` or ``punctuation`` sets.

    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words or token in punctuation:
            continue
        kept.append(token)
    return " ".join(kept)

# Clean every review and collect the results in document order.
preprocessed_text.extend(preprocess_text(review) for review in documents)
print(preprocessed_text[10])


# In[4]:


import spacy
from spacy import load
from spacy.lang.ru.examples import sentences
from spacy.lang.ru import Russian

# Load the spaCy language pipeline "ru_core_news_md".
nlp = spacy.load("ru_core_news_md")

# Collects the lemmatized text of every review.
preprocessed_lemmatized_text = []

def preprocess_and_lemmatize(text):
    """Return the space-joined lemmas of the lowercased *text*.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is skipped when its surface form is a member of the module-level
    ``punctuation`` or ``stop_words`` sets; for every other token its
    ``lemma_`` is kept.
    """
    lemmas = []
    for token in nlp(text.lower()):
        surface = token.text
        if surface in punctuation or surface in stop_words:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Lemmatize every cleaned review and collect the results in document order.
preprocessed_lemmatized_text.extend(
    preprocess_and_lemmatize(cleaned) for cleaned in preprocessed_text
)
print(preprocessed_lemmatized_text[10])


# In[6]:


# Number of lemmatized reviews.
print(len(preprocessed_lemmatized_text))


# In[17]:


# Spot-check a slice of the lemmatized reviews.
print(preprocessed_lemmatized_text[940:950])


# In[37]:


import csv

# Save the lemmatized reviews to a CSV file. writerow() puts all of them into
# a single row, one review per cell.
with open('out.csv', 'w', encoding="utf-8", newline='') as file:
    csv.writer(file).writerow(preprocessed_lemmatized_text)


# ### Формирование TF-IDF матрицы ключевых слов

# In[5]:


from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Build the TF-IDF matrix over the lemmatized reviews: one row per review,
# one column per vocabulary term.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_lemmatized_text)

# Vocabulary terms in column order. get_feature_names() was removed from
# scikit-learn in 1.2, so get_feature_names_out() is preferred; the old
# method is only a fallback for releases that predate the new one.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# TF-IDF weights of the first review only. The sparse row is selected before
# densifying so the whole matrix is never materialized as a dense array.
tfidf_scores = tfidf_matrix[0].toarray().ravel()

# Vocabulary terms ordered by descending TF-IDF weight in the first review.
sorted_keywords = [word for _, word in sorted(zip(tfidf_scores, feature_names), reverse=True)]

print("Ключевые слова:", sorted_keywords)


# In[9]:


# Dimensions of the TF-IDF matrix (reviews x vocabulary terms).
tfidf_matrix.shape


# In[9]:


# Print the sparse TF-IDF matrix.
print(tfidf_matrix)


# ### Converting the "Ценности" column into a numeric list for the classifier

# In[23]:


tfidf_matrix.shape


# In[24]:


# NOTE: this cell originally evaluated `tsvd_data2D.shape`, but tsvd_data2D is
# only created by the TruncatedSVD step further down the file, so running the
# file top to bottom stopped here with a NameError. The premature lookup has
# been removed.


# In[6]:


import pandas as pd

# Second pass over the source file: the first line is now used as the header,
# and two columns are kept (indices 10 and 17).
df = pd.read_csv(
    'База_данных_отзывы.csv',
    delimiter=';',
    header=0,
    usecols=[10, 17],
)


# In[7]:


# Keep only rows in which both selected columns are present.
# NOTE(review): the TF-IDF rows come from `documents`, which dropped rows
# based on the review column alone - confirm both filters keep the same rows,
# otherwise the labels and the feature rows are misaligned.
df = df.dropna()


# In[8]:


# Encode the label column: 'релевантно' -> 1, 'не определено' -> 0.
label_codes = {'релевантно': 1, 'не определено': 0}
df["Ценности"] = df["Ценности"].replace(label_codes)


# In[9]:


# Label list passed to the classifiers as the target.
values = df['Ценности'].tolist()


# In[16]:


# Spot-check a slice of the labels.
print(values[940:950])


# In[38]:


# Sum of the encoded label column.
df["Ценности"].sum()


# ### Классификатор случайного леса

# #### Использование матрицы  TF-IDF обычной размерности

# In[10]:


from sklearn.model_selection import train_test_split

# Features: the full TF-IDF matrix. Target: the encoded label list.
x = tfidf_matrix
y = values

# Hold out 30% of the samples for testing; the split seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=90
)


# In[11]:


from sklearn.ensemble import RandomForestClassifier

# Forest of 3000 trees, each limited to depth 2. fit() returns the estimator
# itself, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=3000, max_depth=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)


# In[13]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))


# In[12]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# In[36]:


import numpy as np
# Sum of the test labels (the number of 1s when labels are 0/1).
np.array(y_test).sum()


# In[41]:


y_pred.shape


# In[43]:


y_pred


# In[42]:


# Sum of the predicted labels.
y_pred.sum()


# #### Using the reduced-dimensionality TF-IDF matrix

# In[14]:


from sklearn.decomposition import TruncatedSVD

# Dimensionality reduction: project the TF-IDF matrix onto 1024 components.
tsvd2D = TruncatedSVD(n_components=1024)
tsvd_data2D = tsvd2D.fit_transform(tfidf_matrix)


# In[22]:


# Dimensions of the reduced matrix. This lookup used to sit above the code
# that creates tsvd_data2D, which raised a NameError when the file was run
# top to bottom; it now follows the computation.
tsvd_data2D.shape


# In[15]:


from sklearn.model_selection import train_test_split

# Features: the SVD-reduced matrix. Target: the encoded label list.
x = tsvd_data2D
y = values

# Hold out 30% of the samples for testing; the split seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=90
)

from sklearn.ensemble import RandomForestClassifier

# Forest of 2000 trees, each limited to depth 2. fit() returns the estimator
# itself, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=2000, max_depth=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)


# In[17]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# In[16]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))


# In[65]:


# Sum of the predicted labels.
y_pred.sum()


# ### Метод K-ближайших соседей

# #### Использование матрицы  TF-IDF обычной размерности

# In[221]:


from sklearn.model_selection import train_test_split

# Features: the full TF-IDF matrix. Target: the encoded label list.
x = tfidf_matrix
y = values

# Hold out 30% of the samples for testing; the split seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=90
)


# In[222]:


from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 1024. fit() returns the estimator
# itself, so construction and training are chained.
model = KNeighborsClassifier(n_neighbors=1024).fit(X_train, y_train)
y_pred = model.predict(X_test)


# In[224]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Sum of the predicted labels and the shape of the prediction array.
print(sum(y_pred), y_pred.shape)


# In[215]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# In[223]:


print(sum(y_pred), y_pred.shape)


# #### Использование матрицы  TF-IDF пониженной размерности

# In[25]:


# Use the reduced-dimensionality matrix as X.
from sklearn.model_selection import train_test_split

x = tsvd_data2D
y = values

# Hold out 30% of the samples for testing; the split seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=90
)

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 1024. fit() returns the estimator
# itself, so construction and training are chained.
model = KNeighborsClassifier(n_neighbors=1024).fit(X_train, y_train)
y_pred = model.predict(X_test)


# In[226]:


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Sum of the predicted labels and the shape of the prediction array.
print(sum(y_pred), y_pred.shape)


# In[227]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# ### Логистическая регрессия

# #### Использование матрицы  TF-IDF обычной размерности

# In[232]:


from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Features: the full TF-IDF matrix. Target: the encoded label list.
x = tfidf_matrix
y = values

# Hold out 30% of the samples for testing; the split seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=90
)

# Logistic regression with the library's default settings.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Sum of the predicted labels and the shape of the prediction array.
print(sum(y_pred), y_pred.shape)


# In[233]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# #### Использование матрицы  TF-IDF пониженной размерности

# In[234]:


# Use the reduced-dimensionality matrix as X.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

x = tsvd_data2D
y = values

# Hold out 30% of the samples for testing; the split seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=90
)

# Logistic regression with the library's default settings.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Sum of the predicted labels and the shape of the prediction array.
print(sum(y_pred), y_pred.shape)


# In[235]:


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, y_pred))


# ### Тематическое моделирование

# #### Неотрицательная матричная факторизация (NMF)

# In[19]:


from sklearn.decomposition import NMF
import csv

# Input for topic modelling: the TF-IDF matrix.
X = tfidf_matrix

# Factorize into `num_topics` topics. fit_transform() fits the model and
# returns the review-by-topic weight matrix in a single pass; previously the
# same model was fitted twice (fit() here, then fit_transform() in the next
# cell), doubling the cost of a 1000-component factorization.
num_topics = 1000
nmf = NMF(n_components=num_topics, random_state=42)
nmf_matrix = nmf.fit_transform(X)

# Vocabulary terms in column order. get_feature_names() was removed from
# scikit-learn in 1.2, so get_feature_names_out() is preferred; the old
# method is only a fallback for releases that predate the new one.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Write the ten highest-weighted terms of every topic, one topic per CSV row,
# strongest term first.
with open('Тематическое_моделирование_NMF.csv', 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    for topic_words in nmf.components_:
        top_words_idx = topic_words.argsort()[-10:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        writer.writerow(top_words)


# In[50]:


# Transposed NMF weights: one row per topic, one column per review.
nmf_tr = nmf_matrix.T


# In[107]:


# Transposing back gives the original orientation: our initial matrix with
# the confidence that a review belongs to a topic.
trans = nmf_tr.T
print(trans)


# In[37]:


# One column per topic, named "Тема№1" ... "Тема№N". N is read from the
# matrix itself instead of repeating the literal 1000, so the table stays
# valid if the number of NMF components changes.
topic_columns = [f'Тема№{i}' for i in range(1, nmf_matrix.shape[1] + 1)]
tm = pd.DataFrame(nmf_matrix, columns=topic_columns)
tm.head(10)


# In[38]:


# Append the label column after the last topic column.
tm.insert(loc=len(tm.columns),
          column='Ценности',
          value=values)


# In[39]:


tm.head(10)


# In[71]:


# Rows whose label equals 1.
q = tm[tm['Ценности'] == 1]
q


# In[72]:


q.shape


# In[78]:


# Every column name except the last one (the label column), then the
# matching slice of the selected rows.
name = list(tm.columns)[:-1]
w = q[name]


# In[83]:


w


# In[92]:


# Values of every column of `w` as a plain list, one inner list per column.
res = [w[column].tolist() for column in w.columns]
print(len(res))


# In[91]:


# Per-topic sums over all reviews. Iterating the rows directly removes the
# hard-coded topic count of 1000.
t = [sum(topic_row) for topic_row in nmf_tr]
print(t)


# In[94]:


# Per-topic sums over the relevant reviews only.
sum_1 = [sum(topic_weights) for topic_weights in res]
print(sum_1)


# In[152]:


import numpy as np

# Average probability over all relevant reviews per topic, i.e. the
# probability that a topic corresponds to a cultural value. A topic whose
# total weight is zero would produce 0/0 (NaN plus a RuntimeWarning); such
# topics get 0.0 instead so later min/max and threshold steps stay
# well-defined.
totals = np.asarray(t, dtype=float)
p = np.divide(
    np.asarray(sum_1, dtype=float),
    totals,
    out=np.zeros_like(totals),
    where=totals != 0,
)
print(p)


# In[237]:


# Select the topics whose probability is greater than 0.9.
pik = p[p > 0.9]
print(pik)


# In[246]:


# Positions of the selected topics in `p`. The positions are read off in a
# single pass: looking each selected value up with list.index() returned the
# first matching position for every duplicate value (so equal probabilities
# mapped to the same topic) and rebuilt the list on every iteration.
inde = [position for position, share in enumerate(p) if share > 0.9]
print(inde)


# In[247]:


# Re-read the exported topic keywords and keep the rows whose 0-based row
# number is one of the selected topic positions.
interestingrows = []
with open('Тематическое_моделирование_NMF.csv',encoding="utf-8") as fd:
    for idx, row in enumerate(csv.reader(fd)):
        if idx in inde:
            interestingrows.append(row)


# In[249]:


# Look at what these topics are, judging by their keywords.
for topic_terms in interestingrows:
    print(topic_terms)


# In[173]:


# 1-based topic numbers for the x axis, sized from `p` rather than the
# hard-coded 1000.
ind = list(range(1, len(p) + 1))


# In[181]:


from matplotlib.backends.backend_pdf import PdfPages
# pyplot was used below without ever being imported, which raised a NameError
# on `plt`.
import matplotlib.pyplot as plt


# In[191]:


# Bar chart of the per-topic probability.
fig = plt.figure(figsize=(50,20))
plt.xlabel('Номер Темы')
plt.ylabel('Вероятность соотношения темы с ценностями')
plt.bar(ind,p)

# Save the figure; the context manager closes the PDF even if savefig()
# raises.
with PdfPages("Гистограмма.pdf") as pdf:
    pdf.savefig(fig)


# In[183]:


# Smallest and largest per-topic probability.
min_v = min(p)
max_v = max(p)
print(min_v, max_v)


# In[153]:


# Scale every review's topic vector element-wise by the per-topic
# probability. The comprehension has its own loop variable; the original loop
# stored each product in the module-level name `y`, overwriting the label
# list the classifiers use.
itog = [review_topics * p for review_topics in trans]
print(itog)


# In[154]:


np.array(itog).shape


# In[155]:


# Final table: one row per review, one column per topic. The column count is
# taken from `p` instead of the hard-coded 1000.
da = pd.DataFrame(np.array(itog), columns=[f'Тема№{i}' for i in range(1, len(p) + 1)])
da.head(10)


# ### Кластеризация и категоризация текстовых данных

# In[42]:


from sklearn.cluster import KMeans
import numpy as np

import csv

# Apply KMeans clustering to the TF-IDF matrix.
num_clusters = 100
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(tfidf_matrix)

# Export the clusters: one CSV row per cluster holding the original review
# texts assigned to it. An earlier write of the lemmatized reviews to this
# same path was dropped: the file was reopened in 'w' mode right afterwards,
# which truncated that content before anything could read it.
with open('кластеры_отзывов.csv', 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    for cluster_id in range(num_clusters):
        # Row positions of the reviews labelled with this cluster.
        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
        print(f"Кластер {cluster_id + 1}:")
        klaster = [documents[idx] for idx in cluster_indices]
        for review in klaster:
            print(review)
        writer.writerow(klaster)
        print("--------")


# In[41]:


print(documents[0])


# ### Синонимы

# In[18]:


import bs4 as bs
import urllib.request
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
# Build the Word2Vec training corpus: one token list per review.
#
# Previously all reviews were concatenated into one string with no separator
# (fusing the last word of a review with the first word of the next), and
# every sentence-ending character was stripped before sentence splitting, so
# the corpus collapsed into one very long "sentence". gensim documents that
# over-long texts are truncated during training - confirm the limit for the
# installed version. Processing review by review avoids both problems and no
# longer reuses the name `p`, which holds the topic-probability array.
russian_stop_words = set(stopwords.words('russian'))  # built once, O(1) lookups

all_words = []
for review in documents:
    # Lowercase, then replace everything outside the allowed character range
    # with a space. 'ё' is listed explicitly because it lies outside the
    # а-я range and was being stripped out of words.
    # NOTE(review): confirm whether the range start is Latin 'a' or Cyrillic
    # 'а'; it is kept exactly as in the original pattern.
    cleaned_review = re.sub('[^a-яё]', ' ', review.lower())
    cleaned_review = re.sub(r'\s+', ' ', cleaned_review)
    # Tokenize and drop stop words.
    tokens = [
        token for token in nltk.word_tokenize(cleaned_review)
        if token not in russian_stop_words
    ]
    if tokens:
        all_words.append(tokens)

# Train Word2Vec on words that occur at least three times in the corpus.
word2vec = Word2Vec(all_words, min_count=3)
print(word2vec.wv.most_similar('справедливость', topn=5))


# In[23]:


print(word2vec.wv.most_similar('труд', topn=100))


# In[122]:


import csv

# Write the tokenized corpus, one token list per CSV row with one token per
# cell. writerow() treated each inner list as a single cell and stored its
# Python repr, which cannot be read back as words.
with open('словарь_слов_из_отзывов.csv', 'w', encoding="utf-8", newline='') as file:
    csv.writer(file).writerows(all_words)