ChatRAG/Services/TextServices/QdrantTextDataService.cs
2025-06-21 14:20:07 -03:00

674 lines
25 KiB
C#

#pragma warning disable SKEXP0001
using ChatRAG.Contracts.VectorSearch;
using ChatRAG.Data;
using ChatRAG.Models;
using ChatRAG.Services.Contracts;
using Microsoft.SemanticKernel.Embeddings;
using System.Text;
using System.Collections.Concurrent;
namespace ChatRAG.Services.TextServices
{
public class QdrantTextDataService : ITextDataService
{
// Vector store backend that persists and queries the documents.
private readonly IVectorSearchService _vectorSearchService;
// Embedding generator applied to document text before it is stored.
private readonly ITextEmbeddingGenerationService _embeddingService;
private readonly ILogger<QdrantTextDataService> _logger;
// Project-id cache (project id -> UTC time it was cached), kept to avoid costly project lookups.
private readonly ConcurrentDictionary<string, DateTime> _projectIdCache =
    new ConcurrentDictionary<string, DateTime>();
// Age after which a cached project id is considered expired.
private readonly TimeSpan _cacheTimeout = TimeSpan.FromMinutes(5);

/// <summary>
/// Creates the service from its injected collaborators.
/// </summary>
/// <param name="vectorSearchService">Vector store used for all document operations.</param>
/// <param name="embeddingService">Generator used to embed document text.</param>
/// <param name="logger">Logger for diagnostics and errors.</param>
public QdrantTextDataService(
    IVectorSearchService vectorSearchService,
    ITextEmbeddingGenerationService embeddingService,
    ILogger<QdrantTextDataService> logger)
{
    _logger = logger;
    _embeddingService = embeddingService;
    _vectorSearchService = vectorSearchService;
}
/// <summary>Name of the storage provider backing this service.</summary>
public string ProviderName
{
    get { return "Qdrant"; }
}
// ========================================
// MÉTODOS ORIGINAIS (compatibilidade com MongoDB)
// ========================================
/// <summary>
/// Saves a new document by delegating to the overload that takes an id, passing
/// <c>null</c> as the id.
/// </summary>
/// <param name="titulo">Document title.</param>
/// <param name="texto">Document body.</param>
/// <param name="projectId">Project the document belongs to.</param>
public async Task SalvarNoMongoDB(string titulo, string texto, string projectId)
    => await SalvarNoMongoDB(null, titulo, texto, projectId);
/// <summary>
/// Creates or updates a document in the vector store. When <paramref name="id"/> is
/// null or empty a new document is added and the project id is cached; otherwise the
/// document with that id is updated.
/// </summary>
/// <param name="id">Id of the document to update, or null/empty to create one.</param>
/// <param name="titulo">Document title.</param>
/// <param name="texto">Document body.</param>
/// <param name="projectId">Project the document belongs to.</param>
/// <remarks>Any exception is logged and rethrown.</remarks>
public async Task SalvarNoMongoDB(string? id, string titulo, string texto, string projectId)
{
    try
    {
        // The embedding is computed once, over the title and body combined.
        var vetor = await GenerateEmbeddingOptimized($"**{titulo}** \n\n {texto}");

        if (!string.IsNullOrEmpty(id))
        {
            // Existing document: overwrite it in place.
            await _vectorSearchService.UpdateDocumentAsync(id, titulo, texto, projectId, vetor);
            _logger.LogDebug("Documento '{Id}' atualizado no Qdrant", id);
            return;
        }

        // New document: add it and remember its project id in the cache.
        var createdId = await _vectorSearchService.AddDocumentAsync(titulo, texto, projectId, vetor);
        _projectIdCache.TryAdd(projectId, DateTime.UtcNow);
        _logger.LogDebug("Documento '{Title}' criado no Qdrant com ID {Id}", titulo, createdId);
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao salvar documento '{Title}' no Qdrant", titulo);
        throw;
    }
}
/// <summary>
/// Splits a full text into sections and saves each section as its own document,
/// running at most five saves at the same time.
/// </summary>
/// <param name="textoCompleto">Full text; sections are delimited by lines starting with "**".</param>
/// <param name="projectId">Project every saved section is assigned to.</param>
/// <remarks>Any exception is logged and rethrown.</remarks>
public async Task SalvarTextoComEmbeddingNoMongoDB(string textoCompleto, string projectId)
{
    try
    {
        var textoArray = ParseTextIntoSections(textoCompleto);

        // Limits concurrency to 5 simultaneous saves. The using declaration disposes the
        // semaphore even when a save fails; Task.WhenAll only completes once every task
        // has finished, so no task can touch the semaphore after it is disposed.
        using var semaphore = new SemaphoreSlim(5, 5);

        var tasks = textoArray.Select(async item =>
        {
            await semaphore.WaitAsync();
            try
            {
                // First line of a section is its title, the remainder is the content.
                var lines = item.Split('\n', 2);
                var title = lines[0].Replace("**", "").Replace("\r", "").Trim();
                var content = lines.Length > 1 ? lines[1] : "";
                await SalvarNoMongoDB(title, content, projectId);
            }
            finally
            {
                semaphore.Release();
            }
        });

        await Task.WhenAll(tasks);

        _logger.LogInformation("Texto completo processado: {SectionCount} seções salvas no Qdrant", textoArray.Count);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao processar texto completo no Qdrant");
        throw;
    }
}
/// <summary>
/// Returns every document of every known project, converted to
/// <see cref="TextoComEmbedding"/>. Projects are queried at most three at a time;
/// a project whose query fails is logged as a warning and contributes no documents.
/// </summary>
/// <returns>The converted documents, or an empty sequence when no project id is known.</returns>
/// <remarks>Any other exception is logged and rethrown.</remarks>
public async Task<IEnumerable<TextoComEmbedding>> GetAll()
{
    try
    {
        // Project ids come from the cached lookup helper.
        var knownProjects = await GetAllProjectIdsOptimized();
        if (knownProjects.Count == 0)
        {
            return Enumerable.Empty<TextoComEmbedding>();
        }

        // At most 3 projects are fetched simultaneously.
        var gate = new SemaphoreSlim(3, 3);
        var pending = knownProjects.Select(async pid =>
        {
            await gate.WaitAsync();
            try
            {
                return await _vectorSearchService.GetDocumentsByProjectAsync(pid);
            }
            catch (Exception falha)
            {
                _logger.LogWarning(falha, "Erro ao buscar documentos do projeto {ProjectId}", pid);
                return new List<VectorSearchResult>();
            }
            finally
            {
                gate.Release();
            }
        });

        var perProject = await Task.WhenAll(pending);
        gate.Dispose();

        // Flatten the per-project results into a single list before converting.
        var merged = new List<VectorSearchResult>();
        foreach (var docs in perProject)
        {
            merged.AddRange(docs);
        }

        return merged.Select(ConvertToTextoComEmbedding);
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao recuperar todos os documentos do Qdrant");
        throw;
    }
}
/// <summary>
/// Returns the documents of one project, converted to <see cref="TextoComEmbedding"/>.
/// </summary>
/// <param name="projectId">Project whose documents are requested.</param>
/// <remarks>Any exception is logged and rethrown.</remarks>
public async Task<IEnumerable<TextoComEmbedding>> GetByPorjectId(string projectId)
{
    try
    {
        var encontrados = await _vectorSearchService.GetDocumentsByProjectAsync(projectId);
        return from doc in encontrados
               select ConvertToTextoComEmbedding(doc);
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao recuperar documentos do projeto {ProjectId} no Qdrant", projectId);
        throw;
    }
}
/// <summary>
/// Returns a single document by id, converted to <see cref="TextoComEmbedding"/>.
/// </summary>
/// <param name="id">Id of the document to fetch.</param>
/// <exception cref="ArgumentException">The vector store returned no document for the id.</exception>
/// <remarks>Every exception, including the not-found one, is logged and rethrown.</remarks>
public async Task<TextoComEmbedding> GetById(string id)
{
    try
    {
        var found = await _vectorSearchService.GetDocumentAsync(id);
        return found is null
            ? throw new ArgumentException($"Documento {id} não encontrado no Qdrant")
            : ConvertToTextoComEmbedding(found);
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao recuperar documento {Id} do Qdrant", id);
        throw;
    }
}
// ========================================
// MÉTODOS NOVOS DA INTERFACE
// ========================================
/// <summary>
/// Saves a document: adds it when <c>document.Id</c> is null or empty, otherwise updates
/// the document with that id. The project id is cached in both cases.
/// </summary>
/// <param name="document">Document to persist, including optional metadata.</param>
/// <returns>The id of the saved document.</returns>
/// <remarks>Any exception is logged and rethrown.</remarks>
public async Task<string> SaveDocumentAsync(DocumentInput document)
{
    try
    {
        // The embedding covers title and content together.
        var vector = await GenerateEmbeddingOptimized($"**{document.Title}** \n\n {document.Content}");

        string savedId;
        if (string.IsNullOrEmpty(document.Id))
        {
            // No id supplied: create a new document and take the id the store assigns.
            savedId = await _vectorSearchService.AddDocumentAsync(
                document.Title,
                document.Content,
                document.ProjectId,
                vector,
                document.Metadata);
        }
        else
        {
            // Id supplied: update the existing document and keep its id.
            await _vectorSearchService.UpdateDocumentAsync(
                document.Id,
                document.Title,
                document.Content,
                document.ProjectId,
                vector,
                document.Metadata);
            savedId = document.Id;
        }

        // Remember the project id in the cache.
        _projectIdCache.TryAdd(document.ProjectId, DateTime.UtcNow);
        _logger.LogDebug("Documento {Id} salvo no Qdrant via SaveDocumentAsync", savedId);
        return savedId;
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao salvar documento no Qdrant");
        throw;
    }
}
/// <summary>
/// Regenerates the embedding for a document and updates it in the vector store.
/// </summary>
/// <param name="id">Id of the document to update.</param>
/// <param name="document">New title, content, project id and metadata.</param>
/// <remarks>Any exception is logged and rethrown.</remarks>
public async Task UpdateDocumentAsync(string id, DocumentInput document)
{
    try
    {
        var textToEmbed = $"**{document.Title}** \n\n {document.Content}";
        var vector = await GenerateEmbeddingOptimized(textToEmbed);

        await _vectorSearchService.UpdateDocumentAsync(
            id, document.Title, document.Content, document.ProjectId, vector, document.Metadata);

        _logger.LogDebug("Documento {Id} atualizado no Qdrant", id);
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao atualizar documento {Id} no Qdrant", id);
        throw;
    }
}
/// <summary>
/// Deletes a document from the vector store.
/// </summary>
/// <param name="id">Id of the document to delete.</param>
/// <remarks>Any exception is logged and rethrown.</remarks>
public async Task DeleteDocumentAsync(string id)
{
    try
    {
        var removal = _vectorSearchService.DeleteDocumentAsync(id);
        await removal;
        _logger.LogDebug("Documento {Id} deletado do Qdrant", id);
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao deletar documento {Id} do Qdrant", id);
        throw;
    }
}
/// <summary>
/// Checks whether a document exists in the vector store.
/// </summary>
/// <param name="id">Id of the document to look for.</param>
/// <returns>
/// The answer from the vector store; <c>false</c> when the check itself fails
/// (the failure is logged, not rethrown).
/// </returns>
public async Task<bool> DocumentExistsAsync(string id)
{
    try
    {
        var exists = await _vectorSearchService.DocumentExistsAsync(id);
        return exists;
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao verificar existência do documento {Id} no Qdrant", id);
        return false;
    }
}
/// <summary>
/// Fetches a document by id and maps it to a <see cref="DocumentOutput"/>.
/// </summary>
/// <param name="id">Id of the document to fetch.</param>
/// <returns>
/// The mapped document, or <c>null</c> when the store returns nothing or the lookup
/// fails (the failure is logged, not rethrown).
/// </returns>
public async Task<DocumentOutput?> GetDocumentAsync(string id)
{
    try
    {
        var found = await _vectorSearchService.GetDocumentAsync(id);
        if (found == null)
        {
            return null;
        }

        // Field-by-field copy from the search result into the output DTO.
        var output = new DocumentOutput
        {
            Id = found.Id,
            Title = found.Title,
            Content = found.Content,
            ProjectId = found.ProjectId,
            Embedding = found.Embedding,
            CreatedAt = found.CreatedAt,
            UpdatedAt = found.UpdatedAt,
            Metadata = found.Metadata
        };
        return output;
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao recuperar documento {Id} do Qdrant", id);
        return null;
    }
}
/// <summary>
/// Fetches all documents of a project and maps each one to a <see cref="DocumentOutput"/>.
/// </summary>
/// <param name="projectId">Project whose documents are requested.</param>
/// <returns>The mapped documents, in the order the vector store returned them.</returns>
/// <remarks>Any exception is logged and rethrown.</remarks>
public async Task<List<DocumentOutput>> GetDocumentsByProjectAsync(string projectId)
{
    try
    {
        var found = await _vectorSearchService.GetDocumentsByProjectAsync(projectId);

        var outputs = new List<DocumentOutput>();
        foreach (var item in found)
        {
            // Field-by-field copy from the search result into the output DTO.
            outputs.Add(new DocumentOutput
            {
                Id = item.Id,
                Title = item.Title,
                Content = item.Content,
                ProjectId = item.ProjectId,
                Embedding = item.Embedding,
                CreatedAt = item.CreatedAt,
                UpdatedAt = item.UpdatedAt,
                Metadata = item.Metadata
            });
        }

        return outputs;
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao recuperar documentos do projeto {ProjectId} do Qdrant", projectId);
        throw;
    }
}
/// <summary>
/// Counts documents in the vector store, passing the optional project id through
/// to the store.
/// </summary>
/// <param name="projectId">Project id forwarded to the store, or <c>null</c>.</param>
/// <returns>
/// The count reported by the store; <c>0</c> when counting fails (the failure is
/// logged, not rethrown).
/// </returns>
public async Task<int> GetDocumentCountAsync(string? projectId = null)
{
    try
    {
        var total = await _vectorSearchService.GetDocumentCountAsync(projectId);
        return total;
    }
    catch (Exception erro)
    {
        _logger.LogError(erro, "Erro ao contar documentos no Qdrant");
        return 0;
    }
}
// ========================================
// OPERAÇÕES EM LOTE OTIMIZADAS
// ========================================
/// <summary>
/// Saves many documents. Documents are grouped by project and processed in batches of
/// five: embeddings for a batch are generated concurrently, then the batch is saved
/// concurrently. A document with a non-empty id is updated, otherwise it is added.
/// Failures do not abort the run; they are logged and the document is skipped.
/// </summary>
/// <param name="documents">Documents to save.</param>
/// <returns>Ids of the documents that were saved successfully.</returns>
public async Task<List<string>> SaveDocumentsBatchAsync(List<DocumentInput> documents)
{
    var ids = new List<string>();

    // Save tasks of one batch run concurrently and record failures from whichever thread
    // resumes them. List<T>.Add is not safe for concurrent writers, so failures go into a
    // thread-safe queue instead.
    var errors = new ConcurrentQueue<Exception>();

    // Small batches to avoid timeouts.
    const int batchSize = 5;

    foreach (var projectGroup in documents.GroupBy(d => d.ProjectId))
    {
        var projectDocs = projectGroup.ToList();

        for (int i = 0; i < projectDocs.Count; i += batchSize)
        {
            var batch = projectDocs.Skip(i).Take(batchSize);

            // Phase 1: generate the embeddings of the batch concurrently. Failures are
            // carried in the result instead of thrown so the rest of the batch proceeds.
            var embeddingTasks = batch.Select(async doc =>
            {
                try
                {
                    var embedding = await GenerateEmbeddingOptimized($"**{doc.Title}** \n\n {doc.Content}");
                    return new { Document = doc, Embedding = embedding, Error = (Exception?)null };
                }
                catch (Exception ex)
                {
                    return new { Document = doc, Embedding = (double[]?)null, Error = ex };
                }
            });
            var embeddingResults = await Task.WhenAll(embeddingTasks);

            // Phase 2: save every document whose embedding was generated.
            var saveTasks = embeddingResults.Select(async result =>
            {
                if (result.Error != null)
                {
                    errors.Enqueue(result.Error);
                    _logger.LogError(result.Error, "Erro ao gerar embedding do documento '{Title}' em lote", result.Document.Title);
                    return null;
                }

                try
                {
                    string id;
                    if (!string.IsNullOrEmpty(result.Document.Id))
                    {
                        await _vectorSearchService.UpdateDocumentAsync(
                            result.Document.Id,
                            result.Document.Title,
                            result.Document.Content,
                            result.Document.ProjectId,
                            result.Embedding!,
                            result.Document.Metadata);
                        id = result.Document.Id;
                    }
                    else
                    {
                        id = await _vectorSearchService.AddDocumentAsync(
                            result.Document.Title,
                            result.Document.Content,
                            result.Document.ProjectId,
                            result.Embedding!,
                            result.Document.Metadata);
                    }
                    return id;
                }
                catch (Exception ex)
                {
                    errors.Enqueue(ex);
                    _logger.LogError(ex, "Erro ao salvar documento '{Title}' em lote", result.Document.Title);
                    return null;
                }
            });

            var batchResults = await Task.WhenAll(saveTasks);

            // Null entries mark failed documents; only real ids are reported back.
            ids.AddRange(batchResults.OfType<string>());
        }

        // Remember the project id in the cache.
        _projectIdCache.TryAdd(projectGroup.Key, DateTime.UtcNow);
    }

    if (!errors.IsEmpty)
    {
        _logger.LogWarning("Batch save completado com {ErrorCount} erros de {TotalCount} documentos",
            errors.Count, documents.Count);
    }

    _logger.LogInformation("Batch save: {SuccessCount}/{TotalCount} documentos salvos no Qdrant",
        ids.Count, documents.Count);

    return ids;
}
/// <summary>
/// Deletes many documents in batches of ten, running the deletions of each batch
/// concurrently. A failed deletion is logged and counted but does not abort the run.
/// </summary>
/// <param name="ids">Ids of the documents to delete.</param>
public async Task DeleteDocumentsBatchAsync(List<string> ids)
{
    // Small batches so the store is not overloaded.
    const int batchSize = 10;

    // Failures are tallied from the task results after each batch completes, so the
    // concurrently running delete tasks never write to shared state.
    var errorCount = 0;

    for (int i = 0; i < ids.Count; i += batchSize)
    {
        var batch = ids.Skip(i).Take(batchSize);

        var tasks = batch.Select(async id =>
        {
            try
            {
                await _vectorSearchService.DeleteDocumentAsync(id);
                return true;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Erro ao deletar documento {Id} em lote", id);
                return false;
            }
        });

        var outcomes = await Task.WhenAll(tasks);
        errorCount += outcomes.Count(succeeded => !succeeded);
    }

    if (errorCount > 0)
    {
        _logger.LogWarning("Batch delete completado com {ErrorCount} erros de {TotalCount} documentos",
            errorCount, ids.Count);
    }
    else
    {
        _logger.LogInformation("Batch delete: {TotalCount} documentos removidos do Qdrant", ids.Count);
    }
}
// ========================================
// ESTATÍSTICAS DO PROVIDER
// ========================================
/// <summary>
/// Builds a statistics dictionary for this provider: the vector store's own stats
/// extended with the total document count, per-project document counts, capability
/// flags and the current size of the project-id cache.
/// </summary>
/// <returns>
/// The extended statistics. When gathering them fails, the failure is logged and a small
/// dictionary describing the error ("health" = "error", the exception message and a
/// timestamp) is returned instead of throwing.
/// </returns>
public async Task<Dictionary<string, object>> GetProviderStatsAsync()
{
    try
    {
        var baseStats = await _vectorSearchService.GetStatsAsync();
        var totalDocs = await GetDocumentCountAsync();

        // Project ids come from the cached lookup helper.
        var projectIds = await GetAllProjectIdsOptimized();
        var projectStats = new Dictionary<string, int>();

        // Per-project counts are fetched concurrently; a project whose count cannot be
        // obtained is reported as 0.
        var countTasks = projectIds.Select(async projectId =>
        {
            try
            {
                var count = await GetDocumentCountAsync(projectId);
                return new { ProjectId = projectId, Count = count };
            }
            catch
            {
                return new { ProjectId = projectId, Count = 0 };
            }
        });

        var countResults = await Task.WhenAll(countTasks);
        foreach (var result in countResults)
        {
            projectStats[result.ProjectId] = result.Count;
        }

        var enhancedStats = new Dictionary<string, object>(baseStats)
        {
            ["text_service_provider"] = "Qdrant",
            ["total_documents_via_text_service"] = totalDocs,
            ["projects_count"] = projectIds.Count,
            ["documents_by_project"] = projectStats,
            ["supports_batch_operations"] = true,
            ["supports_metadata"] = true,
            ["embedding_auto_generation"] = true,
            ["cache_enabled"] = true,
            ["cached_project_ids"] = _projectIdCache.Count
        };

        return enhancedStats;
    }
    catch (Exception ex)
    {
        // Log the full exception: the returned dictionary only carries its message.
        _logger.LogError(ex, "Erro ao obter estatísticas do provider Qdrant");

        return new Dictionary<string, object>
        {
            ["provider"] = "Qdrant",
            ["text_service_provider"] = "Qdrant",
            ["health"] = "error",
            ["error"] = ex.Message,
            ["last_check"] = DateTime.UtcNow
        };
    }
}
// ========================================
// MÉTODOS AUXILIARES PRIVADOS OTIMIZADOS
// ========================================
/// <summary>
/// Generates the embedding of a text and returns it as an array of doubles.
/// </summary>
/// <param name="content">Text to embed.</param>
/// <returns>The embedding values, each converted to <c>double</c>.</returns>
private async Task<double[]> GenerateEmbeddingOptimized(string content)
{
    var raw = (await _embeddingService.GenerateEmbeddingAsync(content)).ToArray();

    // Convert element by element into the double[] the vector store calls expect.
    var widened = new double[raw.Length];
    for (var i = 0; i < raw.Length; i++)
    {
        widened[i] = (double)raw[i];
    }

    return widened;
}
/// <summary>
/// Splits a text into sections. A line starting with "**" (or "\r**") is a heading that
/// opens a new section; all other lines are the body of the current section. Each
/// returned entry has the form "title: " + newline + body, with "**" and "\r" stripped
/// from the title. Sections without any body line are not emitted.
/// </summary>
/// <param name="textoCompleto">Text to split; lines are separated by "\n".</param>
/// <returns>The sections in order of appearance; empty when the input is null or empty.</returns>
private static List<string> ParseTextIntoSections(string textoCompleto)
{
    var sections = new List<string>();

    // Nothing to parse: avoid emitting a bogus section with an empty title.
    if (string.IsNullOrEmpty(textoCompleto))
    {
        return sections;
    }

    var lines = textoCompleto.Split('\n');

    // Until the first heading is seen, the first line serves as the title.
    var title = lines[0];
    var body = new StringBuilder();

    // Emits the section collected so far, if it has any body, and resets the buffer.
    void Flush()
    {
        if (body.Length == 0)
        {
            return;
        }

        sections.Add(title.Replace("**", "").Replace("\r", "") + ": " + Environment.NewLine + body.ToString());
        body.Clear();
    }

    foreach (var line in lines)
    {
        // Ordinal comparison: the markers are literal characters, not linguistic text.
        var isHeading = line.StartsWith("**", StringComparison.Ordinal)
            || line.StartsWith("\r**", StringComparison.Ordinal);

        if (isHeading)
        {
            Flush();
            // Always adopt the newest heading, even when the previous heading had no
            // body; otherwise the following body would be filed under a stale title.
            title = line;
        }
        else
        {
            body.AppendLine(line);
        }
    }

    // Emit the trailing section, if any.
    Flush();

    return sections;
}
// UTC ticks of the last successful full project-id scan; 0 means none has completed yet.
// Accessed through Interlocked because the service may be used by concurrent callers.
private long _lastProjectScanTicks;

/// <summary>
/// Returns the known project ids. Expired cache entries are evicted first. The cache is
/// only trusted as the complete list when a full scan of the vector store succeeded
/// within the cache timeout; otherwise a new scan is performed. Individual saves also
/// add entries to the cache, so a non-empty cache alone does not mean it is complete.
/// </summary>
/// <returns>
/// The project ids found by the scan or held in the cache. When the scan fails, the
/// failure is logged and whatever the cache currently holds (possibly nothing) is returned.
/// </returns>
private async Task<List<string>> GetAllProjectIdsOptimized()
{
    var now = DateTime.UtcNow;

    // Evict entries older than the cache timeout.
    var expiredKeys = _projectIdCache
        .Where(kvp => now - kvp.Value > _cacheTimeout)
        .Select(kvp => kvp.Key)
        .ToList();
    foreach (var key in expiredKeys)
    {
        _projectIdCache.TryRemove(key, out _);
    }

    // Use the cache only if it was filled by a recent full scan and still has entries.
    var lastScan = new DateTime(Interlocked.Read(ref _lastProjectScanTicks), DateTimeKind.Utc);
    var scanIsFresh = now - lastScan <= _cacheTimeout;
    if (scanIsFresh && !_projectIdCache.IsEmpty)
    {
        return _projectIdCache.Keys.ToList();
    }

    try
    {
        // Costly lookup: a similarity search with a dummy vector, no project filter and a
        // zero threshold, used to discover which project ids exist.
        // NOTE(review): assumes the collection uses 384-dimensional vectors and that the
        // backend accepts an all-zero query vector; at most 1000 results are inspected,
        // so projects beyond that window are not discovered - confirm.
        var allResults = await _vectorSearchService.SearchSimilarAsync(
            new double[384],
            projectId: null,
            threshold: 0.0,
            limit: 1000);

        var projectIds = allResults
            .Select(r => r.ProjectId)
            .Where(pid => !string.IsNullOrEmpty(pid))
            .Distinct()
            .ToList();

        // Refresh the cache: the indexer overwrites older timestamps so every scanned id
        // stays cached for as long as this scan is considered fresh.
        foreach (var projectId in projectIds)
        {
            _projectIdCache[projectId] = now;
        }
        Interlocked.Exchange(ref _lastProjectScanTicks, now.Ticks);

        return projectIds;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Erro ao recuperar IDs de projetos do Qdrant");

        // Best effort: fall back to the ids still cached (an empty list when there are none).
        return _projectIdCache.Keys.ToList();
    }
}
/// <summary>
/// Maps a vector search result to the <see cref="TextoComEmbedding"/> model. Fields that
/// have no direct counterpart are read from the result's metadata under the keys
/// "project_name", "document_type", "category" and "tags"; missing values become an
/// empty string (or an empty array for tags).
/// </summary>
/// <param name="result">Search result to convert.</param>
private static TextoComEmbedding ConvertToTextoComEmbedding(VectorSearchResult result)
{
    var metadata = result.Metadata;

    // Reads a metadata value as text; "" when the metadata or the value is absent.
    string MetaText(string key) => metadata?.GetValueOrDefault(key)?.ToString() ?? "";

    // Tags are only taken when stored as a string array; anything else yields no tags.
    var tags = metadata?.GetValueOrDefault("tags") as string[] ?? Array.Empty<string>();

    return new TextoComEmbedding
    {
        Id = result.Id,
        Titulo = result.Title,
        Conteudo = result.Content,
        ProjetoId = result.ProjectId,
        Embedding = result.Embedding,
        ProjetoNome = MetaText("project_name"),
        TipoDocumento = MetaText("document_type"),
        Categoria = MetaText("category"),
        Tags = tags
    };
}
}
}
#pragma warning restore SKEXP0001