Hi There,
I have a use case where we receive a large PDF file (700 to 800 MB in size) containing 400k pages, and the task is to split the file into individual pages. I am running into an out-of-memory exception after splitting around 100k pages. Please suggest any alternatives or workarounds.
My code to split the PDF file is below:
public int SplitPdfFileByPageDelimiter(string outputFilesDir, string pageDelimiterString, int extractStringLength) { if (string.IsNullOrEmpty(pageDelimiterString)) { throw new ArgumentNullException("pageDelimiterString", "[SplitPdfFileByPageDelimiter()]: delimiterString for spliting pdf file is null or empty."); } // open document var pdfDocument = new Aspose.Pdf.Document(this.InputFileStream); if (string.IsNullOrEmpty(outputFilesDir)) { throw new Exception("OutputFileDirectory missing"); } if (!Directory.Exists(outputFilesDir)) { Directory.CreateDirectory(outputFilesDir); } var outputFileName = Path.GetFileName(this.InputFilePath); if (string.IsNullOrEmpty(outputFileName)) { outputFileName = string.Format("{0}.pdf", Path.GetDirectoryName(outputFilesDir)); } var outputFileFormat = string.Concat(Path.GetFileNameWithoutExtension(outputFileName), "_{0}", Path.GetExtension(outputFileName)); var docCount = 1; var resetCount = 0; var document = new Aspose.Pdf.Document(); // loop through all the pages foreach (Page pdfPage in pdfDocument.Pages) { resetCount++; document.Pages.Add(pdfPage); var textFragmentAbsorber = new TextFragmentAbsorber(); pdfPage.Accept(textFragmentAbsorber); var textFragmentCollection = textFragmentAbsorber.TextFragments; foreach (TextFragment textFragment in textFragmentCollection) { var pdfFileUniqueIdIndex = textFragment.Text.IndexOf(pageDelimiterString); if (pdfFileUniqueIdIndex <= -1) { continue; } pdfFileUniqueIdIndex += pageDelimiterString.Length; var pdfFileUniqueId = extractStringLength > 0 ? 
textFragment.Text.Mid(pdfFileUniqueIdIndex, extractStringLength).Trim() : docCount.ToString(); if (string.IsNullOrWhiteSpace(pdfFileUniqueId)) { pdfFileUniqueId = string.Format("NoPdfFileUniqueId.{0}", Guid.NewGuid().ToString()); } var pdfFileFullName = Path.Combine(outputFilesDir, string.Format(outputFileFormat, pdfFileUniqueId)); if (!File.Exists(pdfFileFullName)) { document.Save(pdfFileFullName); } document.FreeMemory(); document.Dispose(); docCount += 1; document = new Aspose.Pdf.Document(); break; } pdfPage.FreeMemory(); // After every 200 pages let the process to sleep for couple of secs. if (resetCount >= 200) { pdfDocument.FreeMemory(); Console.WriteLine(string.Format("[SplitPdfFileByPageDelimiter()]: Going to sleep at document count : [{0}]", docCount)); Thread.Sleep(2000); resetCount = 0; } }