Convert-it/Areas/DocumentConverters/Controllers/PdfToTextController.cs

319 lines
13 KiB
C#

using System.IO;
using System.Text;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Localization;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using iText.Kernel.Exceptions;
namespace Convert_It_Online.Areas.DocumentConverters.Controllers
{
[Area("DocumentConverters")]
public class PdfToTextController : Controller
{
// Source of every localized string this controller hands to views and JSON responses.
private readonly IStringLocalizer<SharedResource> _localizer;
// Receives the "[PDF-TEXT]" diagnostics written by the extraction pipeline.
private readonly ILogger<PdfToTextController> _logger;
// Upper bound, in bytes, on uploads accepted when a preview is requested (10 * 1024 * 1024).
private const long MaxPreviewSize = 10 * 1024 * 1024; // 10MB

/// <summary>
/// Creates the controller with its injected localizer and logger.
/// </summary>
/// <param name="localizer">Localizer stored for all later resource lookups.</param>
/// <param name="logger">Logger stored for diagnostics.</param>
public PdfToTextController(IStringLocalizer<SharedResource> localizer, ILogger<PdfToTextController> logger)
    => (_localizer, _logger) = (localizer, logger);
/// <summary>
/// Copies a fixed set of localized strings into the view data under the same name as their
/// resource key (home link, menu titles, converter titles, footer/about/contact/terms labels).
/// </summary>
private void SetCommonViewBagProperties()
{
    // Each entry is used both as the resource key and as the view-data key.
    string[] sharedKeys =
    {
        "HomeLink",
        "TextMenuTitle",
        "ImageMenuTitle",
        "DocumentMenuTitle",
        "CaseConverterTitle",
        "JpgToWebpTitle",
        "HeicToJpgTitle",
        "PdfToTextTitle",
        "PdfBarcodeTitle",
        "FooterText",
        "About",
        "Contact",
        "Terms",
    };

    foreach (var key in sharedKeys)
    {
        // ViewBag is a dynamic view over ViewData, so this is the same write as ViewBag.<key> = ...
        ViewData[key] = _localizer[key];
    }
}
/// <summary>
/// Fills the view data needed by the Index view: first the shared entries set by
/// <see cref="SetCommonViewBagProperties"/>, then the page-specific labels, FAQ texts and
/// the meta description (a copy of the page description).
/// </summary>
private void PrepareIndexView()
{
    SetCommonViewBagProperties();

    // Left: key the view reads. Right: resource key looked up in the localizer.
    (string ViewKey, string ResourceKey)[] pageEntries =
    {
        ("PageTitle", "PdfTextConverterPageTitle"),
        ("PageDescription", "PdfTextConverterPageDescription"),
        ("PdfPlainTextTabTitle", "PdfPlainTextTabTitle"),
        ("PdfMarkdownTabTitle", "PdfMarkdownTabTitle"),
        ("PdfFileInputLabel", "PdfFileInputLabel"),
        ("PdfPasswordLabel", "PdfPasswordLabel"),
        ("PdfPasswordPlaceholder", "PdfPasswordPlaceholder"),
        ("PdfPasswordHint", "PdfPasswordHint"),
        ("ExtractPlainTextButton", "ExtractPlainTextButton"),
        ("ExtractMarkdownButton", "ExtractMarkdownButton"),
        ("DownloadPlainTextButton", "DownloadPlainTextButton"),
        ("DownloadMarkdownButton", "DownloadMarkdownButton"),
        ("PdfTextPreviewTitle", "PdfTextPreviewTitle"),
        ("SelectFileError", "SelectFileError"),
        ("FaqWhatTitle", "PdfTextFaqWhatTitle"),
        ("FaqWhatContent", "PdfTextFaqWhatContent"),
        ("FaqHowTitle", "PdfTextFaqHowTitle"),
        ("FaqHowContent", "PdfTextFaqHowContent"),
        ("FaqWhyTitle", "PdfTextFaqWhyTitle"),
        ("FaqWhyContent", "PdfTextFaqWhyContent"),
        ("FaqSecurityTitle", "PdfTextFaqSecurityTitle"),
        ("FaqSecurityContent", "PdfTextFaqSecurityContent"),
        ("FaqLimitsTitle", "PdfTextFaqLimitsTitle"),
        ("FaqLimitsContent", "PdfTextFaqLimitsContent"),
    };

    foreach (var (viewKey, resourceKey) in pageEntries)
    {
        // ViewBag is a dynamic view over ViewData, so this matches ViewBag.<viewKey> = ...
        ViewData[viewKey] = _localizer[resourceKey];
    }

    // The meta description reuses whatever was just stored as the page description.
    ViewData["MetaDescription"] = ViewData["PageDescription"];
}
/// <summary>
/// Renders the converter page after loading its localized view data.
/// </summary>
/// <returns>The default view for this action.</returns>
public IActionResult Index()
{
    PrepareIndexView();
    var page = View();
    return page;
}
/// <summary>
/// Diagnostic GET endpoint that returns a fixed success payload with the current local time.
/// The message is Portuguese for "Routing is working!".
/// </summary>
/// <returns>JSON with <c>success</c>, <c>message</c> and <c>timestamp</c>.</returns>
[HttpGet]
public IActionResult Test() =>
    Json(new
    {
        success = true,
        message = "Roteamento funcionando!",
        timestamp = DateTime.Now
    });
/// <summary>
/// POST endpoint that extracts the text of an uploaded PDF as plain text.
/// </summary>
/// <param name="pdfFile">Uploaded PDF file.</param>
/// <param name="password">Optional document password.</param>
/// <param name="preview">When true the result is returned as JSON instead of a file download.</param>
/// <returns>Whatever <see cref="HandleExtraction"/> produces for the plain-text format.</returns>
[HttpPost]
public async Task<IActionResult> ExtractPlainText(IFormFile pdfFile, string? password, bool preview = false)
    => await HandleExtraction(pdfFile, password, preview, toMarkdown: false);
/// <summary>
/// POST endpoint that extracts the text of an uploaded PDF and passes it through the Markdown formatter.
/// </summary>
/// <param name="pdfFile">Uploaded PDF file.</param>
/// <param name="password">Optional document password.</param>
/// <param name="preview">When true the result is returned as JSON instead of a file download.</param>
/// <returns>Whatever <see cref="HandleExtraction"/> produces for the Markdown format.</returns>
[HttpPost]
public async Task<IActionResult> ExtractMarkdown(IFormFile pdfFile, string? password, bool preview = false)
    => await HandleExtraction(pdfFile, password, preview, toMarkdown: true);
/// <summary>
/// Shared pipeline behind <see cref="ExtractPlainText"/> and <see cref="ExtractMarkdown"/>:
/// validates the upload, extracts its text and returns it either as JSON (preview) or as a
/// UTF-8 file download.
/// </summary>
/// <param name="pdfFile">Uploaded file; null or empty is reported as <c>SelectFileError</c>.</param>
/// <param name="password">Optional document password forwarded to the extractor.</param>
/// <param name="preview">
/// True to answer with JSON (<c>success</c>, plus <c>message</c> on failure or
/// <c>content</c>/<c>filename</c>/<c>format</c> on success); false to answer with a file
/// download, or with the Index view carrying a model error on failure.
/// </param>
/// <param name="toMarkdown">True to run the text through <see cref="ToMarkdown"/> and use the <c>.md</c> extension.</param>
/// <returns>The JSON, view or file result described above.</returns>
private async Task<IActionResult> HandleExtraction(IFormFile? pdfFile, string? password, bool preview, bool toMarkdown)
{
    // Single exit for every failure: JSON for previews, the Index view with a model error otherwise.
    IActionResult Fail(string errorKey)
    {
        var localizedMessage = _localizer[errorKey].Value;
        if (preview)
        {
            return Json(new { success = false, message = localizedMessage });
        }

        ModelState.AddModelError("pdfFile", localizedMessage);
        PrepareIndexView();
        return View("Index");
    }

    if (pdfFile == null || pdfFile.Length == 0)
    {
        _logger.LogWarning("[PDF-TEXT] Attempt without file");
        return Fail("SelectFileError");
    }

    if (!IsValidPdf(pdfFile))
    {
        _logger.LogWarning("[PDF-TEXT] Invalid file type: {ContentType}", pdfFile.ContentType);
        return Fail("InvalidPdfFileError");
    }

    // The size cap applies to previews only; downloads are not limited here.
    if (preview && pdfFile.Length > MaxPreviewSize)
    {
        return Fail("PdfPreviewTooLarge");
    }

    var extraction = await TryExtractTextAsync(pdfFile, password);
    if (!extraction.Success)
    {
        if (!preview)
        {
            // The view path additionally exposes the message outside of ModelState.
            ViewBag.ConversionError = _localizer[extraction.ErrorKey].Value;
        }

        return Fail(extraction.ErrorKey);
    }

    var textContent = extraction.Content ?? string.Empty;
    if (toMarkdown)
    {
        textContent = ToMarkdown(textContent);
    }

    // Derive the download name from the upload; fall back to "resultado" when it has no usable base name.
    var extension = toMarkdown ? ".md" : ".txt";
    var fileBaseName = Path.GetFileNameWithoutExtension(pdfFile.FileName);
    var downloadFileName = (string.IsNullOrWhiteSpace(fileBaseName) ? "resultado" : fileBaseName) + extension;

    if (preview)
    {
        return Json(new
        {
            success = true,
            content = textContent,
            filename = downloadFileName,
            format = toMarkdown ? "markdown" : "text"
        });
    }

    // The payload is UTF-8 encoded, so declare the charset instead of leaving clients to guess.
    var contentType = (toMarkdown ? "text/markdown" : "text/plain") + "; charset=utf-8";
    var payload = Encoding.UTF8.GetBytes(textContent);
    return File(payload, contentType, downloadFileName);
}
/// <summary>
/// Buffers the upload in memory and tries to open it with iText, first without a password and
/// then with each distinct byte encoding of the supplied password. Returns the text of the
/// first attempt that yields any content.
/// </summary>
/// <remarks>
/// A rejected attempt never stops the loop. Previously, a <see cref="BadPasswordException"/> on
/// the initial no-password attempt aborted the loop whenever the user had supplied a password,
/// so the supplied password was never tried on an encrypted document.
/// A password error is now reported only when at least one attempt was actually rejected.
/// </remarks>
/// <param name="pdfFile">Uploaded file to read.</param>
/// <param name="password">Optional document password typed by the user.</param>
/// <returns>
/// <c>Success = true</c> with the extracted text and an empty error key, or
/// <c>Success = false</c> with one of the resource keys <c>PdfPasswordRequired</c>,
/// <c>PdfInvalidPassword</c> or <c>InvalidPdfFileError</c>.
/// </returns>
private async Task<(bool Success, string? Content, string ErrorKey)> TryExtractTextAsync(IFormFile pdfFile, string? password)
{
    try
    {
        await using var uploadBuffer = new MemoryStream();
        await pdfFile.CopyToAsync(uploadBuffer);
        var pdfBytes = uploadBuffer.ToArray();

        var passwordRejected = false;
        foreach (var candidate in BuildPasswordCandidates(password))
        {
            try
            {
                var readerProperties = new ReaderProperties();
                if (candidate != null)
                {
                    readerProperties.SetPassword(candidate);
                }

                // Each attempt gets its own stream over the buffered bytes.
                using var pdfReader = new PdfReader(new MemoryStream(pdfBytes), readerProperties);
                using var pdfDocument = new PdfDocument(pdfReader);

                var text = ExtractDocumentText(pdfDocument);
                if (!string.IsNullOrWhiteSpace(text))
                {
                    return (true, text, string.Empty);
                }
            }
            catch (BadPasswordException)
            {
                // Remember the rejection and keep going: the next candidate may be the right one.
                passwordRejected = true;
            }
        }

        if (passwordRejected)
        {
            return (false, null, string.IsNullOrEmpty(password) ? "PdfPasswordRequired" : "PdfInvalidPassword");
        }

        return (false, null, "InvalidPdfFileError");
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "[PDF-TEXT] Failed to extract text from {FileName}", pdfFile.FileName);
        return (false, null, "InvalidPdfFileError");
    }
}

/// <summary>
/// Builds the ordered list of passwords to try: no password first, then the UTF-8, ASCII and
/// Latin-1 encodings of <paramref name="password"/>, skipping encodings that produce bytes
/// already in the list.
/// </summary>
private static List<byte[]?> BuildPasswordCandidates(string? password)
{
    var candidates = new List<byte[]?> { null };
    if (string.IsNullOrEmpty(password))
    {
        return candidates;
    }

    foreach (var encoding in new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Latin1 })
    {
        var encoded = encoding.GetBytes(password);
        var alreadyListed = candidates.Exists(known => known != null && known.AsSpan().SequenceEqual(encoded));
        if (!alreadyListed)
        {
            candidates.Add(encoded);
        }
    }

    return candidates;
}

/// <summary>
/// Concatenates the text of every page of an opened document, each followed by a blank line.
/// Pages for which no strategy returns text get a placeholder line instead.
/// </summary>
private string ExtractDocumentText(PdfDocument pdfDocument)
{
    var builder = new StringBuilder();
    var pageCount = pdfDocument.GetNumberOfPages();

    for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
    {
        var page = pdfDocument.GetPage(pageNumber);

        // Tried in order; the first strategy that returns non-blank text wins.
        var strategies = new ITextExtractionStrategy[]
        {
            new LocationTextExtractionStrategy(),
            new SimpleTextExtractionStrategy()
        };

        var pageText = string.Empty;
        foreach (var strategy in strategies)
        {
            try
            {
                pageText = PdfTextExtractor.GetTextFromPage(page, strategy);
                if (!string.IsNullOrWhiteSpace(pageText))
                {
                    break;
                }
            }
            catch (Exception ex)
            {
                // Best effort: fall through to the next strategy, but leave a trace of the failure.
                _logger.LogDebug(ex, "[PDF-TEXT] {Strategy} failed on page {Page}", strategy.GetType().Name, pageNumber);
            }
        }

        builder.AppendLine(string.IsNullOrWhiteSpace(pageText)
            ? $"[Página {pageNumber} - texto não detectado ou pode conter apenas imagens]"
            : pageText);
        builder.AppendLine();
    }

    return builder.ToString().Trim();
}
/// <summary>
/// Decides whether an upload is treated as a PDF, based only on the metadata sent with it:
/// a lower-cased content type of <c>application/pdf</c> or <c>application/x-pdf</c>, or
/// failing that a <c>.pdf</c> file extension (case-insensitive). File contents are not inspected.
/// </summary>
/// <param name="file">Uploaded file; null yields false.</param>
/// <returns>True when either the content type or the extension identifies a PDF.</returns>
private static bool IsValidPdf(IFormFile file)
{
    if (file is null)
    {
        return false;
    }

    var declaredType = file.ContentType?.ToLowerInvariant();

    // The extension is consulted only when the declared content type is not a PDF type.
    return (declaredType is "application/pdf" or "application/x-pdf")
        || Path.GetExtension(file.FileName).Equals(".pdf", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Normalizes extracted text for the Markdown output: converts CRLF to LF, trims the whole
/// text, strips trailing whitespace from every line and rejoins the lines with
/// <see cref="Environment.NewLine"/>.
/// </summary>
/// <param name="text">Raw extracted text.</param>
/// <returns>The cleaned text, or an empty string when the input is only whitespace.</returns>
private static string ToMarkdown(string text)
{
    var unified = text.Replace("\r\n", "\n").Trim();
    if (unified.Length == 0)
    {
        // After Trim(), whitespace-only input is exactly the empty string.
        return string.Empty;
    }

    var cleanedLines = Array.ConvertAll(unified.Split('\n'), line => line.TrimEnd());
    return string.Join(Environment.NewLine, cleanedLines).Trim();
}
}
}