using System.IO;
using System.Text;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Localization;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using iText.Kernel.Exceptions;

namespace Convert_It_Online.Areas.DocumentConverters.Controllers
{
    /// <summary>
    /// MVC controller for the "PDF to text" tool: accepts an uploaded PDF (optionally
    /// password protected) and returns its text either as a JSON preview or as a
    /// downloadable <c>.txt</c> / <c>.md</c> file.
    /// </summary>
    // NOTE(review): the constructor takes the non-generic IStringLocalizer / ILogger.
    // The default ASP.NET Core container only registers the generic forms
    // (IStringLocalizer<T>, ILogger<T>) - confirm the app registers the non-generic
    // services, or switch to the generic types the project intends to use.
    [Area("DocumentConverters")]
    public class PdfToTextController : Controller
    {
        /// <summary>Largest upload (in bytes) accepted for an inline JSON preview.</summary>
        private const long MaxPreviewSize = 10 * 1024 * 1024; // 10MB

        private readonly IStringLocalizer _localizer;
        private readonly ILogger _logger;

        /// <summary>Creates the controller with its localization and logging services.</summary>
        /// <param name="localizer">Source of every localized UI string and error message.</param>
        /// <param name="logger">Logger used for "[PDF-TEXT]" diagnostics.</param>
        public PdfToTextController(IStringLocalizer localizer, ILogger logger)
        {
            _localizer = localizer;
            _logger = logger;
        }

        /// <summary>Copies the localized strings shared by the site layout (menus, footer) into the ViewBag.</summary>
        private void SetCommonViewBagProperties()
        {
            ViewBag.HomeLink = _localizer["HomeLink"];
            ViewBag.TextMenuTitle = _localizer["TextMenuTitle"];
            ViewBag.ImageMenuTitle = _localizer["ImageMenuTitle"];
            ViewBag.DocumentMenuTitle = _localizer["DocumentMenuTitle"];
            ViewBag.CaseConverterTitle = _localizer["CaseConverterTitle"];
            ViewBag.JpgToWebpTitle = _localizer["JpgToWebpTitle"];
            ViewBag.HeicToJpgTitle = _localizer["HeicToJpgTitle"];
            ViewBag.PdfToTextTitle = _localizer["PdfToTextTitle"];
            ViewBag.PdfBarcodeTitle = _localizer["PdfBarcodeTitle"];
            ViewBag.FooterText = _localizer["FooterText"];
            ViewBag.About = _localizer["About"];
            ViewBag.Contact = _localizer["Contact"];
            ViewBag.Terms = _localizer["Terms"];
        }

        /// <summary>Populates the ViewBag with everything the Index view reads: layout strings plus the page-specific labels and FAQ entries.</summary>
        private void PrepareIndexView()
        {
            SetCommonViewBagProperties();

            ViewBag.PageTitle = _localizer["PdfTextConverterPageTitle"];
            ViewBag.PageDescription = _localizer["PdfTextConverterPageDescription"];
            ViewBag.PdfPlainTextTabTitle = _localizer["PdfPlainTextTabTitle"];
            ViewBag.PdfMarkdownTabTitle = _localizer["PdfMarkdownTabTitle"];
            ViewBag.PdfFileInputLabel = _localizer["PdfFileInputLabel"];
            ViewBag.PdfPasswordLabel = _localizer["PdfPasswordLabel"];
            ViewBag.PdfPasswordPlaceholder = _localizer["PdfPasswordPlaceholder"];
            ViewBag.PdfPasswordHint = _localizer["PdfPasswordHint"];
            ViewBag.ExtractPlainTextButton = _localizer["ExtractPlainTextButton"];
            ViewBag.ExtractMarkdownButton = _localizer["ExtractMarkdownButton"];
            ViewBag.DownloadPlainTextButton = _localizer["DownloadPlainTextButton"];
            ViewBag.DownloadMarkdownButton = _localizer["DownloadMarkdownButton"];
            ViewBag.PdfTextPreviewTitle = _localizer["PdfTextPreviewTitle"];
            ViewBag.SelectFileError = _localizer["SelectFileError"];

            ViewBag.FaqWhatTitle = _localizer["PdfTextFaqWhatTitle"];
            ViewBag.FaqWhatContent = _localizer["PdfTextFaqWhatContent"];
            ViewBag.FaqHowTitle = _localizer["PdfTextFaqHowTitle"];
            ViewBag.FaqHowContent = _localizer["PdfTextFaqHowContent"];
            ViewBag.FaqWhyTitle = _localizer["PdfTextFaqWhyTitle"];
            ViewBag.FaqWhyContent = _localizer["PdfTextFaqWhyContent"];
            ViewBag.FaqSecurityTitle = _localizer["PdfTextFaqSecurityTitle"];
            ViewBag.FaqSecurityContent = _localizer["PdfTextFaqSecurityContent"];
            ViewBag.FaqLimitsTitle = _localizer["PdfTextFaqLimitsTitle"];
            ViewBag.FaqLimitsContent = _localizer["PdfTextFaqLimitsContent"];

            // The meta description reuses the page description set above.
            ViewBag.MetaDescription = ViewBag.PageDescription;
        }

        /// <summary>Renders the converter page.</summary>
        public IActionResult Index()
        {
            PrepareIndexView();
            return View();
        }

        /// <summary>Returns a small JSON payload (success flag, fixed message, current local time).</summary>
        // NOTE(review): looks like a routing smoke-test endpoint - confirm it should ship.
        [HttpGet]
        public IActionResult Test()
        {
            return Json(new { success = true, message = "Roteamento funcionando!", timestamp = DateTime.Now });
        }

        /// <summary>Extracts the text of the uploaded PDF as plain text.</summary>
        /// <param name="pdfFile">The uploaded PDF.</param>
        /// <param name="password">Optional password for an encrypted PDF.</param>
        /// <param name="preview">When true the result is returned as JSON instead of a file download.</param>
        [HttpPost]
        public async Task<IActionResult> ExtractPlainText(IFormFile pdfFile, string? password, bool preview = false)
        {
            return await HandleExtraction(pdfFile, password, preview, toMarkdown: false);
        }

        /// <summary>Extracts the text of the uploaded PDF and returns it as a Markdown document.</summary>
        /// <param name="pdfFile">The uploaded PDF.</param>
        /// <param name="password">Optional password for an encrypted PDF.</param>
        /// <param name="preview">When true the result is returned as JSON instead of a file download.</param>
        [HttpPost]
        public async Task<IActionResult> ExtractMarkdown(IFormFile pdfFile, string? password, bool preview = false)
        {
            return await HandleExtraction(pdfFile, password, preview, toMarkdown: true);
        }

        /// <summary>
        /// Shared implementation of both extract actions: validates the upload, runs the
        /// extraction and shapes the response.
        /// </summary>
        /// <returns>
        /// In preview mode a JSON object (<c>success</c> plus either <c>message</c> or
        /// <c>content</c>/<c>filename</c>/<c>format</c>); otherwise the Index view with a
        /// model error on failure, or a UTF-8 file download on success.
        /// </returns>
        private async Task<IActionResult> HandleExtraction(IFormFile? pdfFile, string? password, bool preview, bool toMarkdown)
        {
            if (pdfFile == null || pdfFile.Length == 0)
            {
                _logger.LogWarning("[PDF-TEXT] Attempt without file");
                return FailureResult(preview, "SelectFileError");
            }

            if (!IsValidPdf(pdfFile))
            {
                _logger.LogWarning("[PDF-TEXT] Invalid file type: {ContentType}", pdfFile.ContentType);
                return FailureResult(preview, "InvalidPdfFileError");
            }

            // The size cap only applies to previews; downloads are not limited here.
            if (preview && pdfFile.Length > MaxPreviewSize)
            {
                return Json(new { success = false, message = _localizer["PdfPreviewTooLarge"].Value });
            }

            var extraction = await TryExtractTextAsync(pdfFile, password);
            if (!extraction.Success)
            {
                return FailureResult(preview, extraction.ErrorKey, setConversionError: true);
            }

            var textContent = extraction.Content ?? string.Empty;
            if (toMarkdown)
            {
                textContent = ToMarkdown(textContent);
            }

            var extension = toMarkdown ? ".md" : ".txt";
            var fileBaseName = Path.GetFileNameWithoutExtension(pdfFile.FileName);
            var downloadFileName = string.IsNullOrWhiteSpace(fileBaseName)
                ? "resultado" + extension
                : fileBaseName + extension;

            if (preview)
            {
                return Json(new
                {
                    success = true,
                    content = textContent,
                    filename = downloadFileName,
                    format = toMarkdown ? "markdown" : "text"
                });
            }

            // The payload is UTF-8 encoded, so say so in the content type; without a
            // charset, clients may guess a legacy encoding and garble non-ASCII text.
            var contentType = toMarkdown ? "text/markdown; charset=utf-8" : "text/plain; charset=utf-8";
            var payload = Encoding.UTF8.GetBytes(textContent);
            return File(payload, contentType, downloadFileName);
        }

        /// <summary>
        /// Builds the failure response for <see cref="HandleExtraction"/>: JSON in preview
        /// mode, otherwise the Index view with a model error on <c>pdfFile</c>.
        /// </summary>
        /// <param name="preview">Whether the caller asked for a JSON preview.</param>
        /// <param name="errorKey">Localization key of the error message.</param>
        /// <param name="setConversionError">When true the message is also exposed as <c>ViewBag.ConversionError</c>.</param>
        private IActionResult FailureResult(bool preview, string errorKey, bool setConversionError = false)
        {
            var message = _localizer[errorKey].Value;
            if (preview)
            {
                return Json(new { success = false, message });
            }

            ModelState.AddModelError("pdfFile", message);
            if (setConversionError)
            {
                ViewBag.ConversionError = message;
            }

            PrepareIndexView();
            return View("Index");
        }

        /// <summary>
        /// Reads the upload into memory and tries to extract its text, first without a
        /// password and then with each distinct byte encoding of the supplied password.
        /// </summary>
        /// <returns>
        /// <c>(true, text, "")</c> on success; otherwise <c>(false, null, key)</c> where
        /// <c>key</c> is <c>PdfPasswordRequired</c> (encrypted, no password given),
        /// <c>PdfInvalidPassword</c> (every candidate was rejected) or
        /// <c>InvalidPdfFileError</c> (unreadable file or no content).
        /// </returns>
        private async Task<(bool Success, string? Content, string ErrorKey)> TryExtractTextAsync(IFormFile pdfFile, string? password)
        {
            try
            {
                await using var memoryStream = new MemoryStream();
                await pdfFile.CopyToAsync(memoryStream);
                var pdfBytes = memoryStream.ToArray();

                var passwordRejected = false;

                foreach (var candidate in BuildPasswordCandidates(password))
                {
                    try
                    {
                        var text = ExtractAllPages(pdfBytes, candidate);
                        if (!string.IsNullOrWhiteSpace(text))
                        {
                            return (true, text, string.Empty);
                        }

                        // The document opened but produced nothing; a different encoding
                        // of the same password cannot change that.
                        break;
                    }
                    catch (BadPasswordException)
                    {
                        // The password-less attempt always runs first, so a rejection here
                        // must NOT end the loop - the supplied password has not been tried yet.
                        passwordRejected = true;
                    }
                }

                if (passwordRejected)
                {
                    return (false, null, string.IsNullOrEmpty(password) ? "PdfPasswordRequired" : "PdfInvalidPassword");
                }

                return (false, null, "InvalidPdfFileError");
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "[PDF-TEXT] Failed to extract text from {FileName}", pdfFile.FileName);
                return (false, null, "InvalidPdfFileError");
            }
        }

        /// <summary>
        /// Returns the passwords to try, in order: <c>null</c> (no password) followed by
        /// the UTF-8, ASCII and Latin-1 encodings of <paramref name="password"/>, skipping
        /// encodings that yield bytes already in the list so the PDF is not re-parsed
        /// needlessly.
        /// </summary>
        private static List<byte[]?> BuildPasswordCandidates(string? password)
        {
            var candidates = new List<byte[]?> { null };
            if (string.IsNullOrEmpty(password))
            {
                return candidates;
            }

            foreach (var encoding in new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Latin1 })
            {
                var encoded = encoding.GetBytes(password);
                var alreadyListed = false;

                foreach (var existing in candidates)
                {
                    if (existing != null && existing.AsSpan().SequenceEqual(encoded))
                    {
                        alreadyListed = true;
                        break;
                    }
                }

                if (!alreadyListed)
                {
                    candidates.Add(encoded);
                }
            }

            return candidates;
        }

        /// <summary>
        /// Opens the PDF with the given password (or none) and concatenates the text of
        /// every page; pages without detectable text get a placeholder line.
        /// </summary>
        /// <exception cref="BadPasswordException">Propagated from iText when the document cannot be opened with <paramref name="password"/>.</exception>
        private string ExtractAllPages(byte[] pdfBytes, byte[]? password)
        {
            var readerProperties = new ReaderProperties();
            if (password != null)
            {
                readerProperties.SetPassword(password);
            }

            using var pdfReader = new PdfReader(new MemoryStream(pdfBytes), readerProperties);
            using var pdfDocument = new PdfDocument(pdfReader);

            var builder = new StringBuilder();
            var numberOfPages = pdfDocument.GetNumberOfPages();

            // iText page numbers are 1-based.
            for (var pageNumber = 1; pageNumber <= numberOfPages; pageNumber++)
            {
                var pageText = ExtractPageText(pdfDocument.GetPage(pageNumber), pageNumber);

                if (!string.IsNullOrWhiteSpace(pageText))
                {
                    builder.AppendLine(pageText);
                }
                else
                {
                    builder.AppendLine($"[Página {pageNumber} - texto não detectado ou pode conter apenas imagens]");
                }

                builder.AppendLine();
            }

            return builder.ToString().Trim();
        }

        /// <summary>
        /// Extracts one page's text, trying the location-based strategy first and the
        /// simple strategy as a fallback. Returns an empty string when neither yields text.
        /// </summary>
        private string ExtractPageText(PdfPage page, int pageNumber)
        {
            var strategies = new ITextExtractionStrategy[]
            {
                new LocationTextExtractionStrategy(),
                new SimpleTextExtractionStrategy()
            };

            foreach (var strategy in strategies)
            {
                try
                {
                    var text = PdfTextExtractor.GetTextFromPage(page, strategy);
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        return text;
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: record the failure and fall back to the next strategy.
                    _logger.LogWarning(ex, "[PDF-TEXT] {Strategy} failed on page {Page}", strategy.GetType().Name, pageNumber);
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Accepts the upload when its declared content type is a PDF media type or its
        /// file name ends in <c>.pdf</c> (both checks are case-insensitive). The file
        /// contents are not inspected here.
        /// </summary>
        private static bool IsValidPdf(IFormFile file)
        {
            if (file == null)
            {
                return false;
            }

            var contentType = file.ContentType;
            if (string.Equals(contentType, "application/pdf", StringComparison.OrdinalIgnoreCase)
                || string.Equals(contentType, "application/x-pdf", StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }

            // string.Equals tolerates a null extension, unlike calling .Equals on it.
            return string.Equals(Path.GetExtension(file.FileName), ".pdf", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Produces the Markdown variant of the extracted text: CRLF line endings are
        /// normalized, trailing whitespace is removed from every line and the whole
        /// result is trimmed. No Markdown syntax is added.
        /// </summary>
        private static string ToMarkdown(string text)
        {
            var normalized = text.Replace("\r\n", "\n").Trim();
            if (string.IsNullOrWhiteSpace(normalized))
            {
                return string.Empty;
            }

            var builder = new StringBuilder();
            foreach (var line in normalized.Split('\n'))
            {
                builder.AppendLine(line.TrimEnd());
            }

            return builder.ToString().Trim();
        }
    }
}