651 lines
23 KiB
C#
651 lines
23 KiB
C#
using ChatRAG.Contracts.VectorSearch;
|
|
using ChatRAG.Models;
|
|
using Microsoft.Extensions.Options;
|
|
using System.Text;
|
|
using System.Text.Json;
|
|
using Microsoft.SemanticKernel.Embeddings;
|
|
using ChatRAG.Settings.ChatRAG.Configuration;
|
|
|
|
namespace ChatRAG.Services.SearchVectors
|
|
{
|
|
public class ChromaVectorSearchService : IVectorSearchService
|
|
{
|
|
private readonly HttpClient _httpClient;
private readonly ILogger<ChromaVectorSearchService> _logger;
private readonly ChromaSettings _settings;
private readonly string _collectionName;

/// <summary>
/// Creates the service, points the supplied <see cref="HttpClient"/> at the configured
/// Chroma host/port and makes sure the configured collection exists.
/// </summary>
/// <param name="settings">Vector database options; the <c>Chroma</c> section must be present.</param>
/// <param name="logger">Logger used by every operation of this service.</param>
/// <param name="httpClient">HTTP client whose <c>BaseAddress</c> is overwritten here.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="settings"/> or <paramref name="httpClient"/> is null, or the Chroma section is missing.
/// </exception>
public ChromaVectorSearchService(
    IOptions<VectorDatabaseSettings> settings,
    ILogger<ChromaVectorSearchService> logger,
    HttpClient httpClient)
{
    if (settings is null)
    {
        throw new ArgumentNullException(nameof(settings));
    }

    // The two-argument overload is required: the single-string overload treats its
    // argument as the parameter name, not as the message.
    _settings = settings.Value.Chroma
        ?? throw new ArgumentNullException(nameof(settings), "Chroma settings not configured");
    _logger = logger;
    _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
    _httpClient.BaseAddress = new Uri($"http://{_settings.Host}:{_settings.Port}");
    _collectionName = _settings.CollectionName;

    // NOTE(review): this blocks the constructing thread on network I/O and lets any
    // initialization failure escape from the constructor. Kept as-is because callers
    // rely on the collection existing once construction completes.
    InitializeAsync().GetAwaiter().GetResult();
}
|
|
|
|
/// <summary>
/// Makes sure the configured collection exists, creating it when the list returned by
/// <c>GetCollectionsAsync</c> does not contain its name. Any failure is logged and rethrown.
/// </summary>
private async Task InitializeAsync()
{
    try
    {
        string[] knownCollections = await GetCollectionsAsync();
        bool alreadyPresent = knownCollections.Contains(_collectionName);

        // Nothing to do when the collection is already listed.
        if (alreadyPresent)
        {
            return;
        }

        await CreateCollectionAsync();
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao inicializar Chroma");
        throw;
    }
}
|
|
|
|
// ========================================
// VECTOR SEARCH
// ========================================
|
|
|
|
/// <summary>
/// Queries the collection for the documents closest to <paramref name="queryEmbedding"/>
/// and keeps those whose similarity (computed by <c>ParseQueryResults</c>) reaches
/// <paramref name="threshold"/>.
/// </summary>
/// <param name="queryEmbedding">Embedding of the query text.</param>
/// <param name="projectId">Optional project filter, matched against the <c>project_id</c> metadata key.</param>
/// <param name="threshold">Minimum similarity a hit must reach to be returned.</param>
/// <param name="limit">Maximum number of neighbours requested from Chroma.</param>
/// <param name="filters">Optional additional metadata filters.</param>
/// <returns>
/// Matching documents ordered by descending score. An empty list is returned when the
/// input cannot produce results, when Chroma answers with a non-success status, or when
/// any exception occurs (errors are logged, never thrown).
/// </returns>
public async Task<List<VectorSearchResult>> SearchSimilarAsync(
    double[] queryEmbedding,
    string? projectId = null,
    double threshold = 0.3,
    int limit = 5,
    Dictionary<string, object>? filters = null)
{
    // A missing/empty embedding or a non-positive limit can never yield hits,
    // so skip the round trip instead of sending a request that is bound to fail.
    if (queryEmbedding is null || queryEmbedding.Length == 0 || limit <= 0)
    {
        return new List<VectorSearchResult>();
    }

    try
    {
        // Build the metadata WHERE filter (null when there is nothing to filter on).
        var whereClause = BuildWhereClause(projectId, filters);

        var query = new
        {
            query_embeddings = new[] { queryEmbedding },
            n_results = limit,
            where = whereClause,
            include = new[] { "documents", "metadatas", "distances" }
        };

        var json = JsonSerializer.Serialize(query);

        // Both the request body and the response hold resources; dispose them deterministically.
        using var content = new StringContent(json, Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync($"/api/v1/collections/{_collectionName}/query", content);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();
            _logger.LogError("Erro na busca Chroma: {Error}", error);
            return new List<VectorSearchResult>();
        }

        var result = await response.Content.ReadAsStringAsync();
        var queryResult = JsonSerializer.Deserialize<ChromaQueryResult>(result);

        return ParseQueryResults(queryResult, threshold);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao buscar similares no Chroma");
        return new List<VectorSearchResult>();
    }
}
|
|
|
|
/// <summary>
/// Searches with progressively lower similarity thresholds until <paramref name="limit"/>
/// results are found or the lowest threshold (0.2) has been tried.
/// </summary>
/// <param name="queryEmbedding">Embedding of the query text.</param>
/// <param name="projectId">Project whose documents are searched.</param>
/// <param name="minThreshold">Threshold used for the first, strictest attempt.</param>
/// <param name="limit">Maximum number of results to return.</param>
/// <returns>At most <paramref name="limit"/> results from the best attempt.</returns>
public async Task<List<VectorSearchResult>> SearchSimilarDynamicAsync(
    double[] queryEmbedding,
    string projectId,
    double minThreshold = 0.5,
    int limit = 5)
{
    // Strategy 1: search with the caller's (highest) threshold.
    var results = await SearchSimilarAsync(queryEmbedding, projectId, minThreshold, limit);

    if (results.Count >= limit)
    {
        return results.Take(limit).ToList();
    }

    // Strategy 2: relax the threshold when the first attempt did not find enough.
    if (minThreshold > 0.35)
    {
        var mediumResults = await SearchSimilarAsync(queryEmbedding, projectId, 0.35, limit * 2);
        if (mediumResults.Count >= limit)
        {
            return mediumResults.Take(limit).ToList();
        }

        // SearchSimilarAsync reports failures as an empty list; never let such a
        // failure discard hits that an earlier attempt already produced.
        if (mediumResults.Count >= results.Count)
        {
            results = mediumResults;
        }
    }

    // Strategy 3: low threshold as a last resort (fewer than `limit` hits so far).
    if (minThreshold > 0.2)
    {
        var lowResults = await SearchSimilarAsync(queryEmbedding, projectId, 0.2, limit * 3);
        if (lowResults.Count >= results.Count)
        {
            results = lowResults;
        }
    }

    return results.Take(limit).ToList();
}
|
|
|
|
// ========================================
// DOCUMENT CRUD
// ========================================
|
|
|
|
/// <summary>
/// Adds a document with its embedding and metadata to the collection under a newly
/// generated GUID.
/// </summary>
/// <param name="title">Stored under the <c>title</c> metadata key.</param>
/// <param name="content">Document text.</param>
/// <param name="projectId">Stored under the <c>project_id</c> metadata key.</param>
/// <param name="embedding">Embedding vector of the document.</param>
/// <param name="metadata">Optional extra metadata; entries override the default keys on collision.</param>
/// <returns>The generated document id.</returns>
/// <exception cref="HttpRequestException">Chroma answered with a non-success status.</exception>
public async Task<string> AddDocumentAsync(
    string title,
    string content,
    string projectId,
    double[] embedding,
    Dictionary<string, object>? metadata = null)
{
    try
    {
        var documentId = Guid.NewGuid().ToString();

        var combinedMetadata = new Dictionary<string, object>
        {
            ["title"] = title,
            ["project_id"] = projectId,
            ["created_at"] = DateTime.UtcNow.ToString("O")
        };

        // Caller-supplied entries are applied last, so they win over the defaults above.
        if (metadata != null)
        {
            foreach (var kvp in metadata)
            {
                combinedMetadata[kvp.Key] = kvp.Value;
            }
        }

        var document = new
        {
            ids = new[] { documentId },
            documents = new[] { content },
            metadatas = new[] { combinedMetadata },
            embeddings = new[] { embedding }
        };

        var json = JsonSerializer.Serialize(document);

        // Dispose the request body and the response deterministically.
        using var requestContent = new StringContent(json, Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync($"/api/v1/collections/{_collectionName}/add", requestContent);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();

            // A specific exception type instead of bare Exception; still caught by `catch (Exception)`.
            throw new HttpRequestException($"Erro ao adicionar documento: {error}");
        }

        return documentId;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao adicionar documento no Chroma");
        throw;
    }
}
|
|
|
|
/// <summary>
/// Replaces the document stored under <paramref name="id"/>: the existing entry is
/// deleted and the new content, embedding and metadata are then upserted.
/// </summary>
/// <param name="id">Id of the document to replace.</param>
/// <param name="title">Stored under the <c>title</c> metadata key.</param>
/// <param name="content">New document text.</param>
/// <param name="projectId">Stored under the <c>project_id</c> metadata key.</param>
/// <param name="embedding">New embedding vector.</param>
/// <param name="metadata">Optional extra metadata; entries override the default keys on collision.</param>
/// <exception cref="HttpRequestException">The upsert request returned a non-success status.</exception>
public async Task UpdateDocumentAsync(
    string id,
    string title,
    string content,
    string projectId,
    double[] embedding,
    Dictionary<string, object>? metadata = null)
{
    try
    {
        // Delete first so the stored entry is replaced as a whole.
        // NOTE(review): delete + upsert is not atomic - if the upsert below fails the
        // document is gone. DeleteDocumentAsync only logs a non-success status.
        await DeleteDocumentAsync(id);

        var combinedMetadata = new Dictionary<string, object>
        {
            ["title"] = title,
            ["project_id"] = projectId,
            ["updated_at"] = DateTime.UtcNow.ToString("O")
        };

        // Caller-supplied entries are applied last, so they win over the defaults above.
        if (metadata != null)
        {
            foreach (var kvp in metadata)
            {
                combinedMetadata[kvp.Key] = kvp.Value;
            }
        }

        var document = new
        {
            ids = new[] { id },
            documents = new[] { content },
            metadatas = new[] { combinedMetadata },
            embeddings = new[] { embedding }
        };

        var json = JsonSerializer.Serialize(document);

        // Dispose the request body and the response deterministically.
        using var requestContent = new StringContent(json, Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync($"/api/v1/collections/{_collectionName}/upsert", requestContent);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();

            // A specific exception type instead of bare Exception; still caught by `catch (Exception)`.
            throw new HttpRequestException($"Erro ao atualizar documento: {error}");
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao atualizar documento no Chroma");
        throw;
    }
}
|
|
|
|
/// <summary>
/// Asks Chroma to delete the document with the given id.
/// </summary>
/// <param name="id">Id of the document to delete.</param>
/// <remarks>
/// A non-success HTTP status is only logged as a warning (best effort); exceptions
/// raised while sending the request are logged and rethrown.
/// </remarks>
public async Task DeleteDocumentAsync(string id)
{
    try
    {
        var deleteRequest = new
        {
            ids = new[] { id }
        };

        var json = JsonSerializer.Serialize(deleteRequest);

        // Dispose the request body and the response deterministically.
        using var content = new StringContent(json, Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync($"/api/v1/collections/{_collectionName}/delete", content);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();
            _logger.LogWarning("Erro ao deletar documento {Id}: {Error}", id, error);
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao deletar documento {Id} no Chroma", id);
        throw;
    }
}
|
|
|
|
// ========================================
// AUXILIARY QUERIES
// ========================================
|
|
|
|
/// <summary>
/// Reports whether <c>GetDocumentAsync</c> can find a document with the given id.
/// </summary>
/// <param name="id">Id of the document to look up.</param>
/// <returns><c>true</c> when a document was returned; <c>false</c> otherwise, including on any error.</returns>
public async Task<bool> DocumentExistsAsync(string id)
{
    try
    {
        var found = await GetDocumentAsync(id);
        return found is not null;
    }
    catch
    {
        // Any failure is treated as "not found".
        return false;
    }
}
|
|
|
|
/// <summary>
/// Fetches a single document (text and metadata) by id.
/// </summary>
/// <param name="id">Id of the document to fetch.</param>
/// <returns>
/// The document with a fixed score of 1.0, or <c>null</c> when the id is empty, the
/// request fails, nothing is returned for the id, or an exception occurs (logged).
/// </returns>
public async Task<VectorSearchResult?> GetDocumentAsync(string id)
{
    // Nothing can match an empty id; avoid the round trip.
    if (string.IsNullOrEmpty(id))
    {
        return null;
    }

    try
    {
        var query = new
        {
            ids = new[] { id },
            include = new[] { "documents", "metadatas" }
        };

        var json = JsonSerializer.Serialize(query);

        // Dispose the request body and the response deterministically.
        using var content = new StringContent(json, Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync($"/api/v1/collections/{_collectionName}/get", content);

        if (!response.IsSuccessStatusCode)
        {
            return null;
        }

        var result = await response.Content.ReadAsStringAsync();
        var getResult = JsonSerializer.Deserialize<ChromaGetResult>(result);

        var ids = getResult?.ids;
        if (ids is null || ids.Length == 0)
        {
            return null;
        }

        // documents/metadatas may be absent or shorter than ids; index them only when
        // the first element exists so an existing document is not reported as missing.
        var documents = getResult!.documents;
        var metadatas = getResult.metadatas;

        return new VectorSearchResult
        {
            Id = ids[0],
            Content = documents is { Length: > 0 } ? (documents[0] ?? "") : "",
            Score = 1.0,
            Metadata = metadatas is { Length: > 0 } ? metadatas[0] : null
        };
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao buscar documento {Id} no Chroma", id);
        return null;
    }
}
|
|
|
|
/// <summary>
/// Returns every document whose <c>project_id</c> metadata equals <paramref name="projectId"/>.
/// </summary>
/// <param name="projectId">Project to list documents for.</param>
/// <returns>
/// The project's documents, each with a fixed score of 1.0. An empty list is returned on
/// a non-success status or on any exception (both are logged).
/// </returns>
public async Task<List<VectorSearchResult>> GetDocumentsByProjectAsync(string projectId)
{
    try
    {
        var query = new
        {
            where = new { project_id = projectId },
            include = new[] { "documents", "metadatas" }
        };

        var json = JsonSerializer.Serialize(query);

        // Dispose the request body and the response deterministically.
        using var content = new StringContent(json, Encoding.UTF8, "application/json");
        using var response = await _httpClient.PostAsync($"/api/v1/collections/{_collectionName}/get", content);

        if (!response.IsSuccessStatusCode)
        {
            var error = await response.Content.ReadAsStringAsync();
            _logger.LogError("Erro ao buscar documentos do projeto {ProjectId}: {Error}", projectId, error);
            return new List<VectorSearchResult>();
        }

        var result = await response.Content.ReadAsStringAsync();
        var getResult = JsonSerializer.Deserialize<ChromaGetResult>(result);

        var results = new List<VectorSearchResult>();

        var ids = getResult?.ids;
        if (ids is null || ids.Length == 0)
        {
            return results;
        }

        var documents = getResult!.documents;
        var metadatas = getResult.metadatas;

        // ids drives the loop; the parallel arrays are read only where they have an
        // entry, so a shorter array cannot throw and discard the whole result set.
        for (int i = 0; i < ids.Length; i++)
        {
            results.Add(new VectorSearchResult
            {
                Id = ids[i],
                Content = documents is not null && i < documents.Length ? (documents[i] ?? "") : "",
                Score = 1.0, // every document of the project is returned, so no ranking applies
                Metadata = metadatas is not null && i < metadatas.Length ? metadatas[i] : null
            });
        }

        return results;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao buscar documentos do projeto {ProjectId} no Chroma", projectId);
        return new List<VectorSearchResult>();
    }
}
|
|
|
|
/// <summary>
/// Asks Chroma for the number of documents in the collection, optionally sending a
/// <c>project_id</c> filter.
/// </summary>
/// <param name="projectId">Optional project filter sent as the <c>where</c> clause.</param>
/// <returns>
/// The reported count, or 0 when the request fails, the response has no usable count,
/// or an exception occurs (logged).
/// </returns>
public async Task<int> GetDocumentCountAsync(string? projectId = null)
{
    try
    {
        var query = new
        {
            where = projectId != null ? new { project_id = projectId } : null
        };

        var json = JsonSerializer.Serialize(query);

        // Dispose the request body and the response deterministically.
        using var content = new StringContent(json, Encoding.UTF8, "application/json");

        // NOTE(review): confirm the deployed Chroma version accepts POST with a `where`
        // body on this route; otherwise this always takes the warning path below.
        using var response = await _httpClient.PostAsync($"/api/v1/collections/{_collectionName}/count", content);

        if (!response.IsSuccessStatusCode)
        {
            _logger.LogWarning("Erro ao contar documentos no Chroma");
            return 0;
        }

        var result = await response.Content.ReadAsStringAsync();

        // Accept both a bare JSON number and an object carrying a `count` property, so a
        // bare-number reply is not turned into 0 by a deserialization failure.
        using var parsed = JsonDocument.Parse(result);
        var root = parsed.RootElement;

        if (root.ValueKind == JsonValueKind.Number && root.TryGetInt32(out var bareCount))
        {
            return bareCount;
        }

        if (root.ValueKind == JsonValueKind.Object
            && root.TryGetProperty("count", out var countElement)
            && countElement.ValueKind == JsonValueKind.Number
            && countElement.TryGetInt32(out var wrappedCount))
        {
            return wrappedCount;
        }

        return 0;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao contar documentos no Chroma");
        return 0;
    }
}
|
|
|
|
// ========================================
// HEALTH CHECK AND METRICS
// ========================================
|
|
|
|
/// <summary>
/// Calls the Chroma heartbeat endpoint.
/// </summary>
/// <returns><c>true</c> on a success status code; <c>false</c> otherwise or when the call throws (logged).</returns>
public async Task<bool> IsHealthyAsync()
{
    bool healthy;

    try
    {
        var heartbeat = await _httpClient.GetAsync("/api/v1/heartbeat");
        healthy = heartbeat.IsSuccessStatusCode;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro no health check do Chroma");
        healthy = false;
    }

    return healthy;
}
|
|
|
|
/// <summary>
/// Collects basic information about the Chroma backend: provider name, collection,
/// host, port, the raw collection info when it can be fetched, and the total document count.
/// </summary>
/// <returns>
/// The collected values, or a dictionary with <c>provider</c>, <c>error</c> and
/// <c>status</c> keys when an exception occurs (logged).
/// </returns>
public async Task<Dictionary<string, object>> GetStatsAsync()
{
    try
    {
        var summary = new Dictionary<string, object>();
        summary["provider"] = "Chroma";
        summary["collection"] = _collectionName;
        summary["host"] = _settings.Host;
        summary["port"] = _settings.Port;

        // Collection details are optional: they are added only when the lookup succeeds.
        var infoResponse = await _httpClient.GetAsync($"/api/v1/collections/{_collectionName}");
        if (infoResponse.IsSuccessStatusCode)
        {
            var body = await infoResponse.Content.ReadAsStringAsync();
            if (JsonSerializer.Deserialize<Dictionary<string, object>>(body) is { } details)
            {
                summary["collection_info"] = details;
            }
        }

        // Total number of documents, without a project filter.
        int total = await GetDocumentCountAsync();
        summary["total_documents"] = total;

        return summary;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao obter stats do Chroma");

        var failure = new Dictionary<string, object>();
        failure["provider"] = "Chroma";
        failure["error"] = ex.Message;
        failure["status"] = "error";
        return failure;
    }
}
|
|
|
|
// ========================================
// PRIVATE HELPER METHODS
// ========================================
|
|
|
|
/// <summary>
/// Lists the names of the collections known to the Chroma server.
/// </summary>
/// <returns>
/// The collection names. An empty array is returned when the request fails, the response
/// is not a JSON array, or an exception occurs (all logged).
/// </returns>
/// <remarks>
/// Two response shapes are accepted: an array of name strings and an array of objects
/// carrying a <c>name</c> property. The shape is detected by inspecting the JSON rather
/// than by catching deserialization exceptions.
/// </remarks>
private async Task<string[]> GetCollectionsAsync()
{
    try
    {
        using var response = await _httpClient.GetAsync("/api/v1/collections");
        if (!response.IsSuccessStatusCode)
        {
            _logger.LogWarning("Erro ao obter collections: {StatusCode}", response.StatusCode);
            return Array.Empty<string>();
        }

        var content = await response.Content.ReadAsStringAsync();

        try
        {
            using var document = JsonDocument.Parse(content);
            var root = document.RootElement;

            if (root.ValueKind != JsonValueKind.Array)
            {
                _logger.LogWarning("Não foi possível parsear lista de collections");
                return Array.Empty<string>();
            }

            var names = new List<string>();
            foreach (var element in root.EnumerateArray())
            {
                switch (element.ValueKind)
                {
                    // Simple shape: the element is the collection name itself.
                    case JsonValueKind.String:
                        names.Add(element.GetString() ?? "");
                        break;

                    // Object shape: read the `name` property; a missing name becomes "".
                    case JsonValueKind.Object:
                        var hasName = element.TryGetProperty("name", out var nameElement)
                            && nameElement.ValueKind == JsonValueKind.String;
                        names.Add(hasName ? nameElement.GetString() ?? "" : "");
                        break;

                    // Anything else cannot name a collection and is ignored.
                }
            }

            return names.ToArray();
        }
        catch (JsonException)
        {
            _logger.LogWarning("Não foi possível parsear lista de collections");
            return Array.Empty<string>();
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao buscar collections");
        return Array.Empty<string>();
    }
}
|
|
|
|
// Helper type for deserializing collection entries that are returned as objects.
// Property names are lower-case so they match the JSON keys without extra options.
private class CollectionInfo
{
    // Metadata attached to the collection, when the server includes it.
    public Dictionary<string, object>? metadata { get; set; }

    // Collection name; stays empty when the JSON object has no `name`.
    public string name { get; set; } = string.Empty;
}
|
|
|
|
/// <summary>
/// Creates the configured collection. A plain create request is tried first; when it
/// fails, the request is repeated with <c>get_or_create = true</c>.
/// </summary>
/// <remarks>
/// When both attempts fail the error is logged and the method returns normally,
/// assuming the collection already exists (best effort, no exception).
/// The collection is created with <c>hnsw:space = cosine</c> because
/// <c>ParseQueryResults</c> converts distances with <c>1 - distance</c>, which is a
/// similarity only for cosine distance. Collections that already exist are not changed.
/// </remarks>
private async Task CreateCollectionAsync()
{
    // Both attempts send the same metadata shape.
    Dictionary<string, object> BuildMetadata() => new()
    {
        ["description"] = "RAG Collection",
        ["created_at"] = DateTime.UtcNow.ToString("O"),
        ["hnsw:space"] = "cosine"
    };

    var collection = new
    {
        name = _collectionName,
        metadata = BuildMetadata()
    };

    var json = JsonSerializer.Serialize(collection);

    // First approach: plain create. Request body and response are disposed deterministically.
    using var content = new StringContent(json, Encoding.UTF8, "application/json");
    using var response = await _httpClient.PostAsync("/api/v1/collections", content);

    // Second approach when the first fails: create through get_or_create.
    if (!response.IsSuccessStatusCode)
    {
        _logger.LogWarning("Método POST falhou, tentando abordagem alternativa");

        var createPayload = new
        {
            name = _collectionName,
            metadata = BuildMetadata(),
            get_or_create = true
        };

        var createJson = JsonSerializer.Serialize(createPayload);
        using var createContent = new StringContent(createJson, Encoding.UTF8, "application/json");
        using var createResponse = await _httpClient.PostAsync("/api/v1/collections", createContent);

        if (!createResponse.IsSuccessStatusCode)
        {
            var error = await createResponse.Content.ReadAsStringAsync();
            _logger.LogError("Erro ao criar collection: {Error}", error);

            // Last resort: assume the collection already exists.
            _logger.LogWarning("Assumindo que collection {CollectionName} já existe", _collectionName);
            return;
        }
    }

    _logger.LogInformation("Collection {CollectionName} criada/verificada com sucesso", _collectionName);
}
|
|
|
|
/// <summary>
/// Builds the metadata filter sent as the <c>where</c> field of a Chroma request.
/// </summary>
/// <param name="projectId">When non-empty, adds a <c>project_id</c> equality condition.</param>
/// <param name="filters">Optional extra conditions; a <c>project_id</c> key here overrides <paramref name="projectId"/>.</param>
/// <returns>
/// <c>null</c> when there is no condition; the single-key dictionary when there is exactly
/// one; otherwise a dictionary of the form <c>{ "$and": [ {k1: v1}, {k2: v2}, ... ] }</c>.
/// </returns>
private object? BuildWhereClause(string? projectId, Dictionary<string, object>? filters)
{
    var conditions = new Dictionary<string, object>();

    if (!string.IsNullOrEmpty(projectId))
    {
        conditions["project_id"] = projectId;
    }

    if (filters != null)
    {
        foreach (var filter in filters)
        {
            conditions[filter.Key] = filter.Value;
        }
    }

    if (conditions.Count == 0)
    {
        return null;
    }

    if (conditions.Count == 1)
    {
        return conditions;
    }

    // Chroma requires a where object to hold exactly one key, so several conditions
    // must be combined under the $and operator, one single-key object per condition.
    var clauses = conditions
        .Select(condition => new Dictionary<string, object> { [condition.Key] = condition.Value })
        .ToArray();

    return new Dictionary<string, object> { ["$and"] = clauses };
}
|
|
|
|
/// <summary>
/// Converts the first result set of a Chroma query response into search results,
/// keeping only hits whose similarity (1 - distance) reaches <paramref name="threshold"/>.
/// </summary>
/// <param name="queryResult">Deserialized query response; may be null.</param>
/// <param name="threshold">Minimum similarity a hit must reach.</param>
/// <returns>The retained hits ordered by descending score (empty when there are none).</returns>
private List<VectorSearchResult> ParseQueryResults(ChromaQueryResult? queryResult, double threshold)
{
    var hits = new List<VectorSearchResult>();

    var documentSets = queryResult?.documents;
    bool hasDocuments = documentSets?.Length > 0 && documentSets[0].Length > 0;

    if (hasDocuments)
    {
        string[] firstSet = documentSets![0];
        int index = 0;

        while (index < firstSet.Length)
        {
            // A missing distances array is treated as distance 1.0 (similarity 0).
            double distance = queryResult!.distances?[0][index] ?? 1.0;

            // Chroma returns distances; turn them into a similarity score.
            double score = 1.0 - distance;

            if (score >= threshold)
            {
                var hit = new VectorSearchResult
                {
                    Id = queryResult.ids[0][index],
                    Content = firstSet[index],
                    Score = score,
                    Metadata = queryResult.metadatas?[0][index]
                };
                hits.Add(hit);
            }

            index++;
        }
    }

    return hits.OrderByDescending(hit => hit.Score).ToList();
}
|
|
}
|
|
|
|
// ========================================
// DTOs FOR THE CHROMA API
// ========================================
|
|
|
|
/// <summary>
/// Shape of the JSON returned by the collection <c>query</c> endpoint. Every member is a
/// jagged array; <c>ParseQueryResults</c> reads only the first inner array of each.
/// Property names are lower-case so they match the JSON keys.
/// </summary>
public class ChromaQueryResult
{
    /// <summary>Document ids per result set; defaults to an empty array.</summary>
    public string[][] ids { get; set; } = Array.Empty<string[]>();

    /// <summary>Distances per result set; null when absent from the response.</summary>
    public double[][]? distances { get; set; }

    /// <summary>Document texts per result set; defaults to an empty array.</summary>
    public string[][] documents { get; set; } = Array.Empty<string[]>();

    /// <summary>Metadata dictionaries per result set; null when absent from the response.</summary>
    public Dictionary<string, object>[][]? metadatas { get; set; }
}
|
|
|
|
/// <summary>
/// Shape of the JSON returned by the collection <c>get</c> endpoint: parallel arrays
/// indexed by document. Property names are lower-case so they match the JSON keys.
/// </summary>
public class ChromaGetResult
{
    /// <summary>Metadata dictionary of each document; null when absent from the response.</summary>
    public Dictionary<string, object>[]? metadatas { get; set; }

    /// <summary>Text of each document; defaults to an empty array.</summary>
    public string[] documents { get; set; } = Array.Empty<string>();

    /// <summary>Id of each document; defaults to an empty array.</summary>
    public string[] ids { get; set; } = Array.Empty<string>();
}
|
|
|
|
/// <summary>
/// Wrapper used when deserializing a count response of the form <c>{ "count": n }</c>.
/// The property name is lower-case so it matches the JSON key.
/// </summary>
public class ChromaCountResult
{
    /// <summary>Number of documents reported by the server; 0 when not set.</summary>
    public int count { get; set; }
}
|
|
} |