from typing import Dict

from fastapi import APIRouter
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

from app.core.enum import SIMILARITY_THRESHOLD
from app.core.internal_funcs.determine_cluster_keyword import determine_cluster_keyword
from app.core.internal_funcs.get_data_from_webmaster import get_data_from_webmaster

router = APIRouter()

@router.get("/clusterize", response_model=Dict)
def make_clusters(
    user: str,
    hosts: str,
    access_token: str,
    query_indicator: str,
    order_by: str,
) -> Dict:
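    """Cluster Webmaster search queries by TF-IDF text similarity.

    Fetches query rows, vectorizes each row's 'processed' text with
    TF-IDF, groups the rows via agglomerative clustering on cosine
    distance, and returns a mapping of cluster label -> list of rows.
    Declared as a plain `def` so FastAPI runs the blocking
    get_data_from_webmaster call in its threadpool.
    """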
    data = get_data_from_webmaster(user, hosts, access_token, query_indicator, order_by)
    # Collect the preprocessed text of every row for vectorization
    key_strings = [row["processed"] for row in data]

    # Compute the TF-IDF matrix for the query strings
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(key_strings)
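    # tfidf_matrix is a sparse matrix of shape (n_queries, n_terms)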

    # Cluster queries whose average pairwise cosine distance stays below
    # the threshold (scikit-learn >= 1.2 takes `metric`; the old
    # `affinity` keyword was removed in 1.4)
    threshold = SIMILARITY_THRESHOLD
    clustering = AgglomerativeClustering(
        n_clusters=None, metric="cosine", linkage="average", distance_threshold=threshold
    )
    cluster_labels = clustering.fit_predict(tfidf_matrix.toarray())  # needs a dense array

    clustered_data: Dict[int, list] = {}

    # Group the original rows by cluster label, casting numpy integer
    # labels to plain int so the keys serialize cleanly
    for cluster_id, data_point in zip(cluster_labels, data):
        clustered_data.setdefault(int(cluster_id), []).append(data_point)

    # TODO: key each cluster by a representative keyword once enabled:
    # result = {determine_cluster_keyword(values): values for values in clustered_data.values()}

    return clustered_data


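# Example request (placeholder values; the parameters are passed straight
# through to get_data_from_webmaster):
#   GET /clusterize?user=<user>&hosts=<hosts>&access_token=<token>&query_indicator=<indicator>&order_by=<field>
# The response maps each cluster label to its rows, e.g. {"0": [...], "1": [...]}
# (JSON object keys are serialized as strings).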