// ChatRAG/Services/SearchVectors/ChromaVectorSearchService.cs
// 2025-06-21 14:20:07 -03:00
//
// 651 lines
// 23 KiB
// C#

using ChatRAG.Contracts.VectorSearch;
using ChatRAG.Models;
using Microsoft.Extensions.Options;
using System.Text;
using System.Text.Json;
using Microsoft.SemanticKernel.Embeddings;
using ChatRAG.Settings.ChatRAG.Configuration;
namespace ChatRAG.Services.SearchVectors
{
public class ChromaVectorSearchService : IVectorSearchService
{
private readonly HttpClient _httpClient;
private readonly ILogger<ChromaVectorSearchService> _logger;
private readonly ChromaSettings _settings;
private readonly string _collectionName;
/// <summary>
/// Creates the service, points the injected <see cref="HttpClient"/> at the configured
/// Chroma host and port, and runs <c>InitializeAsync</c> so the target collection is
/// checked (and created when missing) before the instance is used.
/// </summary>
/// <param name="settings">Vector database options; the <c>Chroma</c> section must be present.</param>
/// <param name="logger">Logger used for diagnostics.</param>
/// <param name="httpClient">HTTP client used for every Chroma call; its <c>BaseAddress</c> is overwritten here.</param>
/// <exception cref="ArgumentNullException">
/// An argument is null, or <paramref name="settings"/> carries no Chroma section.
/// </exception>
public ChromaVectorSearchService(
    IOptions<VectorDatabaseSettings> settings,
    ILogger<ChromaVectorSearchService> logger,
    HttpClient httpClient)
{
    if (settings == null)
    {
        throw new ArgumentNullException(nameof(settings));
    }

    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));

    // The single-string ArgumentNullException constructor takes a parameter NAME, not a
    // message, so both are passed explicitly to keep the message readable.
    _settings = settings.Value.Chroma
        ?? throw new ArgumentNullException(nameof(settings), "Chroma settings not configured");

    _httpClient.BaseAddress = new Uri($"http://{_settings.Host}:{_settings.Port}");
    _collectionName = _settings.CollectionName;

    // Constructors cannot await, so initialization is waited on synchronously.
    // NOTE(review): this blocks the constructing thread on network I/O; consider moving
    // the call to an explicit asynchronous start-up step.
    InitializeAsync().GetAwaiter().GetResult();
}
/// <summary>
/// Ensures the configured collection exists: lists the known collections and creates the
/// configured one when its name is not among them. Any failure is logged and rethrown.
/// </summary>
private async Task InitializeAsync()
{
    try
    {
        var knownCollections = await GetCollectionsAsync();
        var alreadyPresent = knownCollections.Contains(_collectionName);
        if (alreadyPresent)
        {
            return;
        }

        // Collection is missing from the listing, so create it.
        await CreateCollectionAsync();
    }
    catch (Exception initFailure)
    {
        _logger.LogError(initFailure, "Erro ao inicializar Chroma");
        throw;
    }
}
// ========================================
// BUSCA VETORIAL
// ========================================
/// <summary>
/// Queries the Chroma collection for the entries nearest to <paramref name="queryEmbedding"/>
/// and returns those whose score reaches <paramref name="threshold"/>.
/// </summary>
/// <param name="queryEmbedding">Embedding vector of the query.</param>
/// <param name="projectId">When non-empty, restricts the search to entries whose <c>project_id</c> metadata matches.</param>
/// <param name="threshold">Minimum score a hit must reach; the score is derived from the returned distance by <c>ParseQueryResults</c>.</param>
/// <param name="limit">Number of results requested from Chroma (<c>n_results</c>).</param>
/// <param name="filters">Optional additional metadata conditions for the <c>where</c> clause.</param>
/// <returns>
/// The parsed hits, or an empty list when the request fails or any exception occurs
/// (failures are logged, never thrown).
/// </returns>
public async Task<List<VectorSearchResult>> SearchSimilarAsync(
    double[] queryEmbedding,
    string? projectId = null,
    double threshold = 0.3,
    int limit = 5,
    Dictionary<string, object>? filters = null)
{
    try
    {
        var payload = new
        {
            query_embeddings = new[] { queryEmbedding },
            n_results = limit,
            where = BuildWhereClause(projectId, filters),
            include = new[] { "documents", "metadatas", "distances" }
        };

        // Request content and response are disposed deterministically once the body has
        // been read, instead of waiting for garbage collection.
        using var requestContent = new StringContent(
            JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

        // NOTE(review): the route is built from the collection name; confirm the target
        // Chroma version resolves collections by name on this endpoint.
        using var response = await _httpClient.PostAsync(
            $"/api/v1/collections/{_collectionName}/query", requestContent);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();
            _logger.LogError("Erro na busca Chroma: {Error}", error);
            return new List<VectorSearchResult>();
        }

        var body = await response.Content.ReadAsStringAsync();
        var queryResult = JsonSerializer.Deserialize<ChromaQueryResult>(body);
        return ParseQueryResults(queryResult, threshold);
    }
    catch (Exception ex)
    {
        // Search is best-effort: callers get an empty list rather than an exception.
        _logger.LogError(ex, "Erro ao buscar similares no Chroma");
        return new List<VectorSearchResult>();
    }
}
/// <summary>
/// Searches with <paramref name="minThreshold"/> first and, while fewer than
/// <paramref name="limit"/> hits are available, retries with progressively lower fixed
/// thresholds (0.35, then 0.2) and larger result windows (2x, then 3x the limit).
/// </summary>
/// <param name="queryEmbedding">Embedding vector of the query.</param>
/// <param name="projectId">Project whose documents are searched.</param>
/// <param name="minThreshold">Threshold of the first attempt; a fallback tier only runs when it is lower than this value.</param>
/// <param name="limit">Maximum number of results returned.</param>
/// <returns>At most <paramref name="limit"/> results from the last search that was executed.</returns>
public async Task<List<VectorSearchResult>> SearchSimilarDynamicAsync(
    double[] queryEmbedding,
    string projectId,
    double minThreshold = 0.5,
    int limit = 5)
{
    // First attempt: the caller's own threshold.
    var hits = await SearchSimilarAsync(queryEmbedding, projectId, minThreshold, limit);

    // Fallback tiers, tried in order: relaxed threshold and the factor applied to the limit.
    var fallbackTiers = new (double Threshold, int LimitFactor)[]
    {
        (0.35, 2),
        (0.2, 3)
    };

    foreach (var (tierThreshold, limitFactor) in fallbackTiers)
    {
        if (hits.Count >= limit)
        {
            break;
        }

        // A tier is only useful when it is actually more permissive than the request.
        if (minThreshold > tierThreshold)
        {
            hits = await SearchSimilarAsync(queryEmbedding, projectId, tierThreshold, limit * limitFactor);
        }
    }

    return hits.Take(limit).ToList();
}
// ========================================
// CRUD DE DOCUMENTOS
// ========================================
/// <summary>
/// Stores a new document in the Chroma collection under a freshly generated GUID.
/// </summary>
/// <param name="title">Title, stored as the <c>title</c> metadata entry.</param>
/// <param name="content">Document text.</param>
/// <param name="projectId">Owning project, stored as the <c>project_id</c> metadata entry.</param>
/// <param name="embedding">Embedding vector of the document.</param>
/// <param name="metadata">Optional extra metadata; entries overwrite the built-in keys on collision.</param>
/// <returns>The generated document id.</returns>
/// <exception cref="HttpRequestException">Chroma answered with a non-success status code.</exception>
public async Task<string> AddDocumentAsync(
    string title,
    string content,
    string projectId,
    double[] embedding,
    Dictionary<string, object>? metadata = null)
{
    try
    {
        var documentId = Guid.NewGuid().ToString();

        var combinedMetadata = new Dictionary<string, object>
        {
            ["title"] = title,
            ["project_id"] = projectId,
            ["created_at"] = DateTime.UtcNow.ToString("O")
        };

        if (metadata != null)
        {
            // Caller-supplied entries are applied last, so they win over the defaults above.
            foreach (var kvp in metadata)
            {
                combinedMetadata[kvp.Key] = kvp.Value;
            }
        }

        var payload = new
        {
            ids = new[] { documentId },
            documents = new[] { content },
            metadatas = new[] { combinedMetadata },
            embeddings = new[] { embedding }
        };

        // Dispose request content and response deterministically.
        using var requestContent = new StringContent(
            JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync(
            $"/api/v1/collections/{_collectionName}/add", requestContent);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();
            // A specific exception type instead of the bare Exception; existing
            // catch (Exception) handlers keep working.
            throw new HttpRequestException($"Erro ao adicionar documento: {error}");
        }

        return documentId;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao adicionar documento no Chroma");
        throw;
    }
}
/// <summary>
/// Replaces the document stored under <paramref name="id"/>: the existing entry is deleted
/// and the new content, embedding and metadata are then sent to the <c>upsert</c> endpoint.
/// </summary>
/// <param name="id">Id of the document to replace.</param>
/// <param name="title">Title, stored as the <c>title</c> metadata entry.</param>
/// <param name="content">New document text.</param>
/// <param name="projectId">Owning project, stored as the <c>project_id</c> metadata entry.</param>
/// <param name="embedding">New embedding vector.</param>
/// <param name="metadata">Optional extra metadata; entries overwrite the built-in keys on collision.</param>
/// <exception cref="HttpRequestException">Chroma answered the upsert with a non-success status code.</exception>
public async Task UpdateDocumentAsync(
    string id,
    string title,
    string content,
    string projectId,
    double[] embedding,
    Dictionary<string, object>? metadata = null)
{
    try
    {
        // Remove the previous entry first, then write the new one through upsert.
        // NOTE(review): if the upsert below fails after the delete succeeded, the document
        // is gone; confirm whether the explicit delete is still required given that the
        // upsert endpoint is used.
        await DeleteDocumentAsync(id);

        var combinedMetadata = new Dictionary<string, object>
        {
            ["title"] = title,
            ["project_id"] = projectId,
            ["updated_at"] = DateTime.UtcNow.ToString("O")
        };

        if (metadata != null)
        {
            // Caller-supplied entries are applied last, so they win over the defaults above.
            foreach (var kvp in metadata)
            {
                combinedMetadata[kvp.Key] = kvp.Value;
            }
        }

        var payload = new
        {
            ids = new[] { id },
            documents = new[] { content },
            metadatas = new[] { combinedMetadata },
            embeddings = new[] { embedding }
        };

        // Dispose request content and response deterministically.
        using var requestContent = new StringContent(
            JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync(
            $"/api/v1/collections/{_collectionName}/upsert", requestContent);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();
            // A specific exception type instead of the bare Exception; existing
            // catch (Exception) handlers keep working.
            throw new HttpRequestException($"Erro ao atualizar documento: {error}");
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao atualizar documento no Chroma");
        throw;
    }
}
/// <summary>
/// Asks Chroma to delete the document with the given id.
/// </summary>
/// <param name="id">Id of the document to delete.</param>
/// <remarks>
/// A non-success HTTP status is only logged as a warning; exceptions raised while sending
/// the request are logged and rethrown.
/// </remarks>
public async Task DeleteDocumentAsync(string id)
{
    try
    {
        var deleteRequest = new
        {
            ids = new[] { id }
        };

        // Dispose request content and response deterministically.
        using var content = new StringContent(
            JsonSerializer.Serialize(deleteRequest), Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync(
            $"/api/v1/collections/{_collectionName}/delete", content);

        if (!response.IsSuccessStatusCode)
        {
            // Deliberately best-effort: a rejected delete does not abort the caller.
            var error = await response.Content.ReadAsStringAsync();
            _logger.LogWarning("Erro ao deletar documento {Id}: {Error}", id, error);
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao deletar documento {Id} no Chroma", id);
        throw;
    }
}
// ========================================
// CONSULTAS AUXILIARES
// ========================================
/// <summary>
/// Reports whether a document with the given id can be fetched from the collection.
/// </summary>
/// <param name="id">Id of the document to look up.</param>
/// <returns>
/// <c>true</c> when <c>GetDocumentAsync</c> yields a document; <c>false</c> when it yields
/// null or when any exception is raised.
/// </returns>
public async Task<bool> DocumentExistsAsync(string id)
{
    bool exists;
    try
    {
        exists = await GetDocumentAsync(id) != null;
    }
    catch
    {
        // Any failure is treated as "not found".
        exists = false;
    }

    return exists;
}
/// <summary>
/// Fetches a single document (text and metadata) by id.
/// </summary>
/// <param name="id">Id of the document to fetch.</param>
/// <returns>
/// The document with a fixed score of 1.0, or <c>null</c> when the request fails, no id is
/// returned, or an exception occurs (exceptions are logged, not thrown).
/// </returns>
public async Task<VectorSearchResult?> GetDocumentAsync(string id)
{
    try
    {
        var query = new
        {
            ids = new[] { id },
            include = new[] { "documents", "metadatas" }
        };

        // Dispose request content and response deterministically.
        using var content = new StringContent(
            JsonSerializer.Serialize(query), Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync(
            $"/api/v1/collections/{_collectionName}/get", content);

        if (!response.IsSuccessStatusCode)
        {
            return null;
        }

        var body = await response.Content.ReadAsStringAsync();
        var parsed = JsonSerializer.Deserialize<ChromaGetResult>(body);

        var ids = parsed?.ids;
        if (parsed == null || ids == null || ids.Length == 0)
        {
            return null;
        }

        // The parallel arrays may be null OR empty; check the length before indexing so a
        // found id is not turned into "not found" by an IndexOutOfRangeException.
        var documents = parsed.documents;
        var metadatas = parsed.metadatas;

        return new VectorSearchResult
        {
            Id = ids[0],
            Content = documents != null && documents.Length > 0 ? (documents[0] ?? "") : "",
            Score = 1.0,
            Metadata = metadatas != null && metadatas.Length > 0 ? metadatas[0] : null
        };
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao buscar documento {Id} no Chroma", id);
        return null;
    }
}
/// <summary>
/// Fetches every document whose <c>project_id</c> metadata equals <paramref name="projectId"/>.
/// </summary>
/// <param name="projectId">Project whose documents are requested.</param>
/// <returns>
/// One result per returned id, each with a fixed score of 1.0; an empty list when the
/// request fails or an exception occurs (failures are logged, not thrown).
/// </returns>
public async Task<List<VectorSearchResult>> GetDocumentsByProjectAsync(string projectId)
{
    try
    {
        var query = new
        {
            where = new { project_id = projectId },
            include = new[] { "documents", "metadatas" }
        };

        // Dispose request content and response deterministically.
        using var content = new StringContent(
            JsonSerializer.Serialize(query), Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync(
            $"/api/v1/collections/{_collectionName}/get", content);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();
            _logger.LogError("Erro ao buscar documentos do projeto {ProjectId}: {Error}", projectId, error);
            return new List<VectorSearchResult>();
        }

        var body = await response.Content.ReadAsStringAsync();
        var parsed = JsonSerializer.Deserialize<ChromaGetResult>(body);

        var results = new List<VectorSearchResult>();
        var ids = parsed?.ids;
        if (parsed == null || ids == null || ids.Length == 0)
        {
            return results;
        }

        var documents = parsed.documents;
        var metadatas = parsed.metadatas;

        // Iterate over the ids and bounds-check the parallel arrays, so a shorter or
        // missing documents/metadatas array cannot throw and discard the whole result.
        for (var i = 0; i < ids.Length; i++)
        {
            results.Add(new VectorSearchResult
            {
                Id = ids[i],
                Content = documents != null && i < documents.Length ? (documents[i] ?? "") : "",
                Score = 1.0, // Every document of the project is returned; no ranking applies.
                Metadata = metadatas != null && i < metadatas.Length ? metadatas[i] : null
            });
        }

        return results;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao buscar documentos do projeto {ProjectId} no Chroma", projectId);
        return new List<VectorSearchResult>();
    }
}
/// <summary>
/// Asks Chroma for the number of documents in the collection, optionally sending a
/// <c>project_id</c> filter.
/// </summary>
/// <param name="projectId">When not null, sent as a <c>where</c> condition on <c>project_id</c>.</param>
/// <returns>
/// The reported count, or 0 when the request fails, the body carries no usable number, or
/// an exception occurs (failures are logged, not thrown).
/// </returns>
public async Task<int> GetDocumentCountAsync(string? projectId = null)
{
    try
    {
        var query = new
        {
            where = projectId != null ? new { project_id = projectId } : null
        };

        // Dispose request content and response deterministically.
        using var content = new StringContent(
            JsonSerializer.Serialize(query), Encoding.UTF8, "application/json");

        // NOTE(review): confirm that the target Chroma version accepts a POST with a
        // "where" body on the count route.
        using var response = await _httpClient.PostAsync(
            $"/api/v1/collections/{_collectionName}/count", content);

        if (!response.IsSuccessStatusCode)
        {
            _logger.LogWarning("Erro ao contar documentos no Chroma");
            return 0;
        }

        var body = await response.Content.ReadAsStringAsync();

        // Accept both a bare JSON number and an object of the form { "count": n }, so a
        // plain numeric body is not reported as 0.
        using var document = JsonDocument.Parse(body);
        var root = document.RootElement;

        if (root.ValueKind == JsonValueKind.Number && root.TryGetInt32(out var bareCount))
        {
            return bareCount;
        }

        if (root.ValueKind == JsonValueKind.Object
            && root.TryGetProperty("count", out var countProperty)
            && countProperty.ValueKind == JsonValueKind.Number
            && countProperty.TryGetInt32(out var wrappedCount))
        {
            return wrappedCount;
        }

        return 0;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao contar documentos no Chroma");
        return 0;
    }
}
// ========================================
// HEALTH CHECK E MÉTRICAS
// ========================================
/// <summary>
/// Checks whether the Chroma server answers its heartbeat endpoint.
/// </summary>
/// <returns>
/// <c>true</c> when the heartbeat request returns a success status code; <c>false</c>
/// otherwise, including when the request throws (the exception is logged).
/// </returns>
public async Task<bool> IsHealthyAsync()
{
    try
    {
        // The response is disposed as soon as the status has been read.
        using var response = await _httpClient.GetAsync("/api/v1/heartbeat");
        return response.IsSuccessStatusCode;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro no health check do Chroma");
        return false;
    }
}
/// <summary>
/// Collects descriptive information about the Chroma connection and collection.
/// </summary>
/// <returns>
/// A dictionary with <c>provider</c>, <c>collection</c>, <c>host</c>, <c>port</c> and
/// <c>total_documents</c>, plus <c>collection_info</c> when the collection lookup succeeds.
/// On an exception, a dictionary with <c>provider</c>, <c>error</c> and <c>status</c> is
/// returned instead (the exception is logged, not thrown).
/// </returns>
public async Task<Dictionary<string, object>> GetStatsAsync()
{
    try
    {
        var stats = new Dictionary<string, object>
        {
            ["provider"] = "Chroma",
            ["collection"] = _collectionName,
            ["host"] = _settings.Host,
            ["port"] = _settings.Port
        };

        // Try to attach the raw collection description; a failed lookup is simply skipped.
        // The response is disposed once its body has been read.
        using (var response = await _httpClient.GetAsync($"/api/v1/collections/{_collectionName}"))
        {
            if (response.IsSuccessStatusCode)
            {
                var content = await response.Content.ReadAsStringAsync();
                var collectionInfo = JsonSerializer.Deserialize<Dictionary<string, object>>(content);
                if (collectionInfo != null)
                {
                    stats["collection_info"] = collectionInfo;
                }
            }
        }

        // Total number of documents, without a project filter.
        stats["total_documents"] = await GetDocumentCountAsync();

        return stats;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao obter stats do Chroma");
        return new Dictionary<string, object>
        {
            ["provider"] = "Chroma",
            ["error"] = ex.Message,
            ["status"] = "error"
        };
    }
}
// ========================================
// MÉTODOS AUXILIARES PRIVADOS
// ========================================
/// <summary>
/// Lists the names of the collections known to the Chroma server.
/// </summary>
/// <returns>
/// The collection names. The response may be an array of strings or an array of objects
/// carrying a <c>name</c> property; both shapes are accepted. An empty array is returned
/// when the request fails, the body cannot be interpreted, or an exception occurs
/// (failures are logged, not thrown).
/// </returns>
private async Task<string[]> GetCollectionsAsync()
{
    try
    {
        // The response is disposed once its body has been read.
        using var response = await _httpClient.GetAsync("/api/v1/collections");

        if (!response.IsSuccessStatusCode)
        {
            _logger.LogWarning("Erro ao obter collections: {StatusCode}", response.StatusCode);
            return Array.Empty<string>();
        }

        var content = await response.Content.ReadAsStringAsync();

        try
        {
            // Inspect the JSON once and branch on each element's kind, instead of
            // deserializing twice and using exceptions to detect the payload shape.
            using var document = JsonDocument.Parse(content);
            var root = document.RootElement;

            if (root.ValueKind != JsonValueKind.Array)
            {
                _logger.LogWarning("Não foi possível parsear lista de collections");
                return Array.Empty<string>();
            }

            var names = new List<string>();
            foreach (var entry in root.EnumerateArray())
            {
                if (entry.ValueKind == JsonValueKind.String)
                {
                    // Plain array-of-names shape.
                    names.Add(entry.GetString() ?? "");
                }
                else if (entry.ValueKind == JsonValueKind.Object)
                {
                    // Array-of-objects shape: bind the entry and keep its name.
                    var info = JsonSerializer.Deserialize<CollectionInfo>(entry.GetRawText());
                    if (info != null)
                    {
                        names.Add(info.name);
                    }
                }
            }

            return names.ToArray();
        }
        catch (JsonException)
        {
            _logger.LogWarning("Não foi possível parsear lista de collections");
            return Array.Empty<string>();
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao buscar collections");
        return Array.Empty<string>();
    }
}
// Helper class used to deserialize one entry of the collection listing when the
// server returns collections as objects rather than plain names.
// Property names are lowercase, matching the JSON keys they are bound to.
private class CollectionInfo
{
// Collection name; defaults to an empty string when the key is absent.
public string name { get; set; } = "";
// Collection metadata as returned by the server; may be absent.
public Dictionary<string, object>? metadata { get; set; }
}
/// <summary>
/// Creates the configured collection. A plain create request is tried first; when it is
/// rejected, the request is repeated with <c>get_or_create = true</c>. If that also fails
/// the error is logged and the collection is assumed to exist already (no exception).
/// </summary>
private async Task CreateCollectionAsync()
{
    // NOTE(review): no distance-function setting is sent, so the server default applies;
    // confirm it matches the "1 - distance" score conversion done in ParseQueryResults.
    var metadata = new
    {
        description = "RAG Collection",
        created_at = DateTime.UtcNow.ToString("O")
    };

    var collection = new
    {
        name = _collectionName,
        metadata
    };

    // Request contents and responses are disposed deterministically.
    using var content = new StringContent(
        JsonSerializer.Serialize(collection), Encoding.UTF8, "application/json");

    // First attempt: plain create.
    using var response = await _httpClient.PostAsync("/api/v1/collections", content);

    if (!response.IsSuccessStatusCode)
    {
        _logger.LogWarning("Método POST falhou, tentando abordagem alternativa");

        // Second attempt: same payload, but asking the server to return the collection
        // when it already exists.
        var createPayload = new
        {
            name = _collectionName,
            metadata,
            get_or_create = true
        };

        using var createContent = new StringContent(
            JsonSerializer.Serialize(createPayload), Encoding.UTF8, "application/json");
        using var createResponse = await _httpClient.PostAsync("/api/v1/collections", createContent);

        if (!createResponse.IsSuccessStatusCode)
        {
            var error = await createResponse.Content.ReadAsStringAsync();
            _logger.LogError("Erro ao criar collection: {Error}", error);

            // Last resort: carry on as if the collection were already there.
            _logger.LogWarning("Assumindo que collection {CollectionName} já existe", _collectionName);
            return;
        }
    }

    _logger.LogInformation("Collection {CollectionName} criada/verificada com sucesso", _collectionName);
}
/// <summary>
/// Builds the Chroma <c>where</c> filter from the project id and any extra conditions.
/// </summary>
/// <param name="projectId">When non-empty, adds a <c>project_id</c> condition.</param>
/// <param name="filters">Optional extra conditions; an entry with the key <c>project_id</c> overrides <paramref name="projectId"/>.</param>
/// <returns>
/// <c>null</c> when there is no condition; the single condition as a one-entry dictionary;
/// or, for several conditions, a dictionary of the form <c>{ "$and": [ {k1: v1}, {k2: v2}, ... ] }</c>.
/// </returns>
private static object? BuildWhereClause(string? projectId, Dictionary<string, object>? filters)
{
    var conditions = new Dictionary<string, object>();

    if (!string.IsNullOrEmpty(projectId))
    {
        conditions["project_id"] = projectId;
    }

    if (filters != null)
    {
        foreach (var filter in filters)
        {
            conditions[filter.Key] = filter.Value;
        }
    }

    if (conditions.Count == 0)
    {
        return null;
    }

    if (conditions.Count == 1)
    {
        return conditions;
    }

    // Chroma expects a where object to hold exactly one condition or operator, so several
    // conditions are combined explicitly with the "$and" operator.
    var clauses = conditions
        .Select(pair => new Dictionary<string, object> { [pair.Key] = pair.Value })
        .ToList();

    return new Dictionary<string, object> { ["$and"] = clauses };
}
/// <summary>
/// Converts the first result row of a Chroma query response into search results, keeping
/// only the entries whose score reaches <paramref name="threshold"/>.
/// </summary>
/// <param name="queryResult">Deserialized query response; may be null.</param>
/// <param name="threshold">Minimum score an entry must reach to be kept.</param>
/// <returns>The kept entries ordered by descending score; empty when there is nothing to parse.</returns>
private List<VectorSearchResult> ParseQueryResults(ChromaQueryResult? queryResult, double threshold)
{
    var results = new List<VectorSearchResult>();

    // Only the first row is read, because a single query embedding is sent.
    var documents = FirstRow(queryResult?.documents);
    var ids = FirstRow(queryResult?.ids);
    if (documents == null || ids == null)
    {
        return results;
    }

    var distances = FirstRow(queryResult?.distances);
    var metadatas = FirstRow(queryResult?.metadatas);

    // Guard against ragged rows: never index past the shorter of ids/documents, and treat
    // a missing distance or metadata entry as absent instead of throwing.
    var count = Math.Min(documents.Length, ids.Length);
    for (var i = 0; i < count; i++)
    {
        // A missing distance falls back to 1.0, i.e. a score of 0.
        var distance = distances != null && i < distances.Length ? distances[i] : 1.0;

        // Chroma returns distances; convert to a similarity-style score (1 - distance).
        // NOTE(review): this conversion presumes a distance in [0, 1]; confirm the
        // collection's distance function matches.
        var similarity = 1.0 - distance;

        if (similarity >= threshold)
        {
            results.Add(new VectorSearchResult
            {
                Id = ids[i],
                Content = documents[i] ?? "",
                Score = similarity,
                Metadata = metadatas != null && i < metadatas.Length ? metadatas[i] : null
            });
        }
    }

    return results.OrderByDescending(r => r.Score).ToList();

    // Returns the first row of a jagged array, or null when there is none.
    static T[]? FirstRow<T>(T[][]? rows) => rows != null && rows.Length > 0 ? rows[0] : null;
}
}
// ========================================
// DTOs PARA CHROMA API
// ========================================
/// <summary>
/// Shape of the JSON body deserialized from the Chroma <c>query</c> endpoint. Every member
/// is a jagged array whose outer index selects a result row; this file reads only row 0.
/// Property names are lowercase, matching the JSON keys they are bound to.
/// </summary>
public class ChromaQueryResult
{
/// <summary>Ids of the matched entries, per result row.</summary>
public string[][] ids { get; set; } = Array.Empty<string[]>();
/// <summary>Document texts of the matched entries, per result row.</summary>
public string[][] documents { get; set; } = Array.Empty<string[]>();
/// <summary>Distances of the matched entries, per result row; may be absent.</summary>
public double[][]? distances { get; set; }
/// <summary>Metadata dictionaries of the matched entries, per result row; may be absent.</summary>
public Dictionary<string, object>[][]? metadatas { get; set; }
}
/// <summary>
/// Shape of the JSON body deserialized from the Chroma <c>get</c> endpoint. The arrays are
/// read as parallel: index <c>i</c> of each member describes the same entry.
/// Property names are lowercase, matching the JSON keys they are bound to.
/// </summary>
public class ChromaGetResult
{
/// <summary>Ids of the returned entries.</summary>
public string[] ids { get; set; } = Array.Empty<string>();
/// <summary>Document texts of the returned entries.</summary>
public string[] documents { get; set; } = Array.Empty<string>();
/// <summary>Metadata dictionaries of the returned entries; may be absent.</summary>
public Dictionary<string, object>[]? metadatas { get; set; }
}
/// <summary>
/// Shape of a count response of the form <c>{ "count": n }</c>.
/// The property name is lowercase, matching the JSON key it is bound to.
/// </summary>
public class ChromaCountResult
{
/// <summary>Number of documents reported; 0 when the key is absent.</summary>
public int count { get; set; }
}
}