Convert PDF to plain text
The following code sample shows how to convert the collection of glyphs on a PDF page to a text string. The algorithm detects spaces and line breaks, and it handles glyphs that overlap each other (which is sometimes done for visual effects).
Code sample to convert PDF to plain text
// Reads the first page of the input PDF and writes its text, in reading order,
// to a plain-text file. Spaces and line breaks are reconstructed from the glyph
// positions by CheckSpaces.
using (FileStream fileIn = new FileStream(@"..\..\..\inputdocuments/sometext.pdf", FileMode.Open, FileAccess.Read))
{
    Document document = new Document(fileIn);

    // Get the first page.
    Page page = document.Pages[0];

    // Retrieve all glyphs from the current page.
    // Note: keep a strong reference to the glyph collection, otherwise the GC
    // can decide to recycle it.
    GlyphCollection glyphs = page.Glyphs;

    // By default the glyph collection is ordered as the glyphs appear in the
    // PDF file; we want them in reading order.
    glyphs.Sort();

    using (FileStream fileOut = new FileStream(@"..\..\extractedText.txt", FileMode.Create, FileAccess.Write))
    // Disposing the writer flushes its buffer, so text written so far reaches
    // the file even when an exception interrupts the loop below.
    using (StreamWriter writer = new StreamWriter(fileOut))
    {
        Glyph previousGlyph = null;
        foreach (Glyph glyph in glyphs)
        {
            int spaces = CheckSpaces(previousGlyph, glyph);
            if (spaces == -1)
            {
                // The glyph starts a new line: insert a line break.
                writer.WriteLine();
            }
            else
            {
                // Insert the number of spaces estimated from the gap.
                for (int i = 0; i < spaces; i++)
                {
                    writer.Write(" ");
                }
            }

            // Insert the characters represented by this glyph.
            foreach (char ch in glyph.Characters)
            {
                writer.Write(ch);
            }

            previousGlyph = glyph;
        }
    }
}
' Reads the first page of the input PDF and writes its text, in reading order,
' to a plain-text file. Spaces and line breaks are reconstructed from the glyph
' positions by CheckSpaces.
Using fileIn As New FileStream("..\..\..\inputdocuments/sometext.pdf", FileMode.Open, FileAccess.Read)
    Dim document As New Document(fileIn)

    ' Get the first page.
    Dim page As Page = document.Pages(0)

    ' Retrieve all glyphs from the current page.
    ' Note: keep a strong reference to the glyph collection, otherwise the GC
    ' can decide to recycle it.
    Dim glyphs As GlyphCollection = page.Glyphs

    ' By default the glyph collection is ordered as the glyphs appear in the
    ' PDF file; we want them in reading order.
    glyphs.Sort()

    Using fileOut As New FileStream("..\..\extractedText.txt", FileMode.Create, FileAccess.Write)
        ' Disposing the writer flushes its buffer, so text written so far reaches
        ' the file even when an exception interrupts the loop below.
        Using writer As New StreamWriter(fileOut)
            Dim previousGlyph As Glyph = Nothing
            For Each glyph As Glyph In glyphs
                Dim spaces As Integer = CheckSpaces(previousGlyph, glyph)
                If spaces = -1 Then
                    ' The glyph starts a new line: insert a line break.
                    writer.WriteLine()
                Else
                    ' Insert the number of spaces estimated from the gap.
                    For i As Integer = 0 To spaces - 1
                        writer.Write(" ")
                    Next
                End If

                ' Insert the characters represented by this glyph.
                For Each ch As Char In glyph.Characters
                    writer.Write(ch)
                Next

                previousGlyph = glyph
            Next
        End Using
    End Using
End Using
/// <summary>
/// Estimates how many space characters separate two consecutive glyphs.
/// </summary>
/// <remarks>
/// Sometimes PDF files don't contain space characters. In that case words are
/// not separated like so: "word1 word2". Instead there are two strings, "word1"
/// and "word2", where word2 is simply placed further away to simulate a " ".
/// To account for this, the positions of the glyphs must be compared.
/// </remarks>
/// <param name="firstGlyph">
/// The glyph that was written previously, or <c>null</c> when
/// <paramref name="secondGlyph"/> is the first glyph.
/// </param>
/// <param name="secondGlyph">The glyph that is about to be written.</param>
/// <returns>
/// <c>-1</c> when the glyphs are not on the same line (the caller converts this
/// into a line break); otherwise the estimated number of spaces between them,
/// which is <c>0</c> for adjacent or overlapping glyphs.
/// </returns>
static int CheckSpaces(Glyph firstGlyph, Glyph secondGlyph)
{
    // Baselines closer than this are treated as the same line. The value is
    // deliberately tiny: it only absorbs floating-point noise.
    // NOTE(review): assumes coordinates are in points - confirm against the API.
    const double baselineTolerance = 0.01;

    // Gaps smaller than this count as [almost] overlapping text.
    const double minimumGap = 0.1;

    if (firstGlyph == null)
    {
        // There is only one glyph, so there is nothing to compare.
        return 0;
    }

    double baselineDelta = firstGlyph.BottomLeft.Y - secondGlyph.BottomLeft.Y;
    if (!(Math.Abs(baselineDelta) <= baselineTolerance))
    {
        // They are not on the same line (-1 is converted into a line break).
        // Written as a negated <= so that a NaN delta still counts as "different".
        return -1;
    }

    double spaceBetween = secondGlyph.BottomLeft.X - firstGlyph.BottomRight.X;
    if (spaceBetween < minimumGap)
    {
        // [Almost] overlapping text.
        return 0;
    }

    double spaceLength = firstGlyph.Font.CalculateWidth(" ", firstGlyph.FontSize);
    if (!(spaceLength > 0))
    {
        // The width of a space is zero, negative or NaN, so the gap cannot be
        // measured in spaces. Dividing would give Infinity/NaN, and converting
        // that to int yields an unspecified value; insert no spaces instead.
        return 0;
    }

    double spaces = spaceBetween / spaceLength;
    return (int)Math.Round(spaces);
}
''' <summary>
''' Estimates how many space characters separate two consecutive glyphs.
''' </summary>
''' <remarks>
''' Sometimes PDF files don't contain space characters. In that case words are
''' not separated like so: "word1 word2". Instead there are two strings, "word1"
''' and "word2", where word2 is simply placed further away to simulate a " ".
''' To account for this, the positions of the glyphs must be compared.
''' </remarks>
''' <param name="firstGlyph">
''' The glyph that was written previously, or Nothing when
''' <paramref name="secondGlyph"/> is the first glyph.
''' </param>
''' <param name="secondGlyph">The glyph that is about to be written.</param>
''' <returns>
''' -1 when the glyphs are not on the same line (the caller converts this into a
''' line break); otherwise the estimated number of spaces between them, which is
''' 0 for adjacent or overlapping glyphs.
''' </returns>
Private Function CheckSpaces(firstGlyph As Glyph, secondGlyph As Glyph) As Integer
    ' Baselines closer than this are treated as the same line. The value is
    ' deliberately tiny: it only absorbs floating-point noise.
    ' NOTE(review): assumes coordinates are in points - confirm against the API.
    Const baselineTolerance As Double = 0.01

    ' Gaps smaller than this count as [almost] overlapping text.
    Const minimumGap As Double = 0.1

    If firstGlyph Is Nothing Then
        ' There is only one glyph, so there is nothing to compare.
        Return 0
    End If

    Dim baselineDelta As Double = firstGlyph.BottomLeft.Y - secondGlyph.BottomLeft.Y
    If Not (Math.Abs(baselineDelta) <= baselineTolerance) Then
        ' They are not on the same line (-1 is converted into a line break).
        ' Written as a negated <= so that a NaN delta still counts as "different".
        Return -1
    End If

    Dim spaceBetween As Double = secondGlyph.BottomLeft.X - firstGlyph.BottomRight.X
    If spaceBetween < minimumGap Then
        ' [Almost] overlapping text.
        Return 0
    End If

    Dim spaceLength As Double = firstGlyph.Font.CalculateWidth(" ", firstGlyph.FontSize)
    If Not (spaceLength > 0) Then
        ' The width of a space is zero, negative or NaN, so the gap cannot be
        ' measured in spaces. Dividing would give Infinity/NaN, and CInt would
        ' then throw an OverflowException; insert no spaces instead.
        Return 0
    End If

    Dim spaces As Double = spaceBetween / spaceLength
    Return CInt(Math.Round(spaces))
End Function
No results
Add Long Term Validation (LTV) data to an existing signature
Render PDF to multipage color TIFF
Render PDF page to Skia surface
Render PDF page as PNG
How to downscale all images in a PDF
How to generate and export certificates
How to downscale all images in a PDF
Add Stamp to PDF
How to use a system font for rendering text
Customize the GUI interaction of a radio button
Customize the UI interaction of a check box
PDF to grayscale TIFF
How to reduce PDF file size
How do I create graphics with Icc based colors
Highlight fields in PDF
Add a note to PDF
Display PDF in a WPF app and stay responsive – the code
Draw interactively on a PDF page
Resize PDF pages
Verify a custom digital PDF signature
C# Print PDF documents from a WPF application
Extract glyph boxes from PDF
Use TrueType font collections
Layout text with MultilineTextShape
Calculate the height of a paragraph in PDF
Merge PDF files in C# .NET
How do I extract page destinations from bookmarks?
Clip PDF page content in C#
How do I use PDFControls.NET in a WPF application
Fill PDF form
Extract glyphs and sort by reading order
Add bookmarks to PDF
How to scale content of PDF
Create rectangles with rounded corners
Create text with decorations
Create layers in PDF and draw on each layer
Multipage TIFF to PDF
TIFF to PDF C#
Crop content on a PDF page
How to embed files in a PDF document
Remove graphics from PDF
Change the color inside a PDF
Create PDF in C#
Text formatting
Import FDF into PDF
Flatten PDF form
Digitally sign a PDF form in C# or VB.NET
Vector graphics in PDF
Translate PDF page content
Extract graphics from PDF
Determine the content bounding box
How to add page numbers to your PDF
Create / impose PDF 2-up
Search text in PDF
Append multiple PDF documents
Convert PDF to plain text
Flatten Markup Annotation
Add text field to PDF
Extract embedded files from PDF
Extract images from PDF
Add a Diagonal Watermark to PDF in C#
Fit image to PDF page
Add simple html text to PDF
Add multiline text to a PDF document
Add single-line text to PDF
Create a new digitally signed PDF document
PDF Viewer Preferences
Change page orientation PDF
Split PDF pages in C# and VB.NET
Append two or more existing PDF files
Change colors of black-and-white TIFF after converting from PDF
Determine if a PDF only contains images
Add footer to PDF
Convert SVG to PDF
Fill in a PDF form using MVC
C# render pdf in browser using MVC
Convert XHTML to PDF
Add hyperlink to PDF
Rotate a PDF page
Change the formatting of a numeric field
How to mirror PDF pages and other shapes
Fill in a template PDF document
How to add autosized text to PDF
Create formfields in PDF documents
Export FDF from PDF form
Add a link with an internal destination to PDF
Remove PDF security settings
Add a link to PDF with an external destination
How to sign and verify updates to a PDF document
Convert PDF to PNG using WPF
Embed TrueType font
Override MouseWheel event
Convert PDF to an image using a dither matrix
Font mapping
Convert PDF with layers to image
C# Print PDF Document
Render PDF with ResolveFont event handler
Render PDF to EMF
Convert PDF to XPS
How to create a thumbnail viewer
How to create a tiling for shapes in PDF
Add footer with left and right aligned text on same line
Convert PDF to JPG in C#
EMF to PDF as vector image
EMF to PDF as raster image
Replace field with image
Add a rubber stamp annotation with a custom icon
Create a text annotation in PDF with rich text
XhtmlParagraph and TrueType fonts
What is the resulting fontsize in PDF for rich text used in a SimpleXhtmlShape
Read and write meta data from PDF
Create a custom signature handler to sign and verify PDF documents
Merge PDF
Stitch PDF documents
Download and convert image to PDF
Convert TXT to PDF
Add barcodes to PDF
Convert PDF to multipage TIFF in C# .NET
Convert multiple PDF pages to bitmap
Render a PDF to bitmap
Bulleted list from XML and XSL
Tagged PDF
PDFKit.NET 5.0 – detailed changes to the API
Create tagged PDF
PDFKit.NET 5.0 and .NET Core
PDFKit.NET 5.0 and Xamarin
Dynamic XFA
PDFKit.NET 5.0 .NET Standard API
.NET Core console app on MacOS
Add tags to existing PDF
Read PDF tags
Merge XDP data with dynamic XFA form
Fill XFA form and export XDP data
Fill and save dynamic XFA form
Use PDFKit.NET 5.0 with a Xamarin.Forms app
Use multiple licenses
Licensing and .NET Standard
Reduce PDF size
Generate PDF form from XML
Generate PDF with local images from XML with Xamarin.iOS
Disable submit button after submitting
Write Document to HttpResponse