Can we use Windows' built-in PDF renderer capabilities to extract text from a PDF?
See the question and my original answer on StackOverflow.
Windows.Data.Pdf does indeed have a text extraction feature (beyond its rendering feature). It's not totally undocumented, as it's based on Windows Search's IFilter interface, as others have said.
Here is some self-sufficient code that allows you to extract all text but also custom properties (such as author, title, dates, etc.) of any .PDF file. It uses a stream as input, so you can search files but also in-memory PDFs, etc.
Example usage:
// Walks every PDF under D:\allmybooks (recursively), prints its path,
// then prints the value of each object the extractor yields for it.
static void Main()
{
    var pdfPaths = Directory.EnumerateFiles(@"D:\allmybooks", "*.pdf", SearchOption.AllDirectories);
    foreach (var pdfPath in pdfPaths)
    {
        Console.WriteLine(pdfPath);

        // The extractor reads from the stream lazily, so keep it open while enumerating.
        using (var input = File.OpenRead(pdfPath))
        {
            var extracted = PdfExtractor.ExtractObjects(input);
            foreach (var item in extracted)
            {
                Console.WriteLine(item.Value);
            }
        }
    }
}
Extractor utility:
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.InteropServices;
using System.Runtime.InteropServices.ComTypes;
using System.Text;
namespace PdfUtilities
{
public static class PdfExtractor
{
// CLSID used to instantiate the Windows.Data.Pdf filter COM class; readonly so it cannot be reassigned at runtime.
private static readonly Guid _windowsDataPdfFilterClsid = new Guid("6c337b26-3e38-4f98-813b-fba18bab64f5");
// One item produced by the extractor: either a run of plain text
// or a named property value (author, title, etc.).
public sealed class PdfObject
{
    // A string when IsText is true; otherwise whatever the property holds (datetime, etc.).
    public object Value { get; internal set; }

    // True when this item is plain text rather than a named property.
    public bool IsText { get; internal set; }

    // Property name such as System.Author or System.Title; not set for text items.
    public string PropertyName { get; internal set; }

    // Text items render as the value alone; property items render as "name: value".
    public override string ToString() =>
        IsText
            ? Value?.ToString()
            : PropertyName + ": " + Value;
}
/// <summary>
/// Extracts text chunks and property values (author, title, dates, ...) from a PDF
/// by driving the Windows.Data.Pdf IFilter implementation over the given stream.
/// </summary>
/// <param name="stream">Readable, seekable stream holding the PDF. It is not disposed by this method.</param>
/// <param name="throwOnError">
/// When true, a failure to initialize the filter throws; when false, the sequence is simply empty.
/// </param>
/// <returns>
/// A lazily evaluated sequence. Each text chunk yields one <see cref="PdfObject"/> with
/// <c>IsText == true</c> holding only that chunk's text; each value chunk yields one
/// <see cref="PdfObject"/> carrying the property name and converted value.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null (thrown immediately, not on first enumeration).</exception>
public static IEnumerable<PdfObject> ExtractObjects(Stream stream, bool throwOnError = false)
{
    // Validate here rather than inside the iterator: iterator bodies do not run until
    // the first MoveNext, which would defer the exception far away from the faulty call.
    if (stream == null)
        throw new ArgumentNullException(nameof(stream));

    return ExtractObjectsIterator(stream, throwOnError);
}

// Iterator that owns the COM filter for the lifetime of the enumeration.
private static IEnumerable<PdfObject> ExtractObjectsIterator(Stream stream, bool throwOnError)
{
    const int FILTER_S_LAST_TEXT = 0x41709;

    var type = Type.GetTypeFromCLSID(_windowsDataPdfFilterClsid);
    var filter = (IFilter)Activator.CreateInstance(type);
    try
    {
        var init = (IInitializeWithStream)filter;
        var chunkText = new StringBuilder(80000); // under LOH
        var sb = new StringBuilder();

        var hr = init.Initialize(new ManagedIStream(stream), 0);
        if (hr < 0)
        {
            if (!throwOnError)
                yield break;

            Marshal.ThrowExceptionForHR(hr);
        }

        hr = filter.Init(0, 0, null, out _);
        if (hr < 0)
        {
            if (!throwOnError)
                yield break;

            Marshal.ThrowExceptionForHR(hr);
        }

        while (true)
        {
            // Any failure HRESULT (this includes the filter's "no more chunks" code) ends the enumeration.
            hr = filter.GetChunk(out var chunk);
            if (hr < 0)
                break;

            if (chunk.flags.HasFlag(CHUNKSTATE.CHUNK_TEXT))
            {
                // Start from scratch for every chunk; otherwise each yielded object
                // would also contain the text of all previous chunks.
                sb.Clear();
                while (true)
                {
                    var size = chunkText.Capacity;
                    hr = filter.GetText(ref size, chunkText);
                    if (hr < 0)
                        break;

                    // On return 'size' is the number of characters written. The filter is not
                    // required to null-terminate the buffer, so trust the count rather than
                    // whatever length the marshaler computed (which may include stale characters).
                    var written = Math.Max(0, Math.Min(size, chunkText.Length));
                    sb.Append(chunkText.ToString(0, written));

                    if (hr == FILTER_S_LAST_TEXT)
                        break;
                }

                yield return new PdfObject { Value = sb.ToString(), IsText = true };
            }

            if (chunk.flags.HasFlag(CHUNKSTATE.CHUNK_VALUE))
            {
                var name = GetPropertyName(chunk.attribute);
                if (filter.GetValue(out var pv) >= 0 && pv != IntPtr.Zero)
                {
                    object value;
                    int converted;
                    try
                    {
                        converted = PropVariantToVariant(pv, out value);
                    }
                    finally
                    {
                        // Release native memory BEFORE yielding: if the consumer stops enumerating
                        // at the yield, code placed after it would never run.
                        // The PROPVARIANT is allocated by the filter with CoTaskMemAlloc, so both
                        // its contents and the structure itself must be freed by the caller.
                        PropVariantClear(pv);
                        Marshal.FreeCoTaskMem(pv);
                    }

                    if (converted >= 0)
                    {
                        yield return new PdfObject { Value = value, PropertyName = name };
                    }
                }
            }
        }
    }
    finally
    {
        // Runs on normal completion, on exception, and when the consumer disposes the
        // enumerator early; drops the filter's references (including the stream wrapper)
        // deterministically instead of waiting for finalization.
        Marshal.ReleaseComObject(filter);
    }
}

// Resolves a display name for the property described by a chunk's FULLPROPSPEC.
private static string GetPropertyName(FULLPROPSPEC attribute)
{
    if (attribute.psProperty.ulKind == PRSPEC.PRSPEC_PROPID)
    {
        // The native field is a union of a 32-bit id and a pointer. On 64-bit the upper
        // bytes are not guaranteed to be zero and IntPtr.ToInt32() would throw
        // OverflowException, so truncate explicitly to the low 32 bits.
        var propId = unchecked((int)attribute.psProperty.propid.ToInt64());
        var pk = new PROPERTYKEY { fmtid = attribute.guidPropSet, propid = propId };
        if (PSGetNameFromPropertyKey(ref pk, out var canonicalName) >= 0)
            return canonicalName;

        // No canonical name registered: fall back to "{guid} id".
        return attribute.guidPropSet.ToString("B") + " " + unchecked((uint)propId);
    }

    if (attribute.psProperty.ulKind == PRSPEC.PRSPEC_LPWSTR && attribute.psProperty.propid != IntPtr.Zero)
        return Marshal.PtrToStringUni(attribute.psProperty.propid);

    return "???";
}
// Native helpers used by ExtractObjects. All three return an int that callers compare against 0
// (negative = failure), i.e. they are treated as HRESULTs.

// Converts the PROPVARIANT at pPropVar into a managed object via VARIANT marshaling.
[DllImport("propsys")] // .NET knows how to handle VARIANT, not PROPVARIANT
private static extern int PropVariantToVariant(IntPtr pPropVar, out object pVar);
// Looks up the canonical name (System.Author, System.Title, etc.) for a property key.
[DllImport("propsys")]
private static extern int PSGetNameFromPropertyKey(ref PROPERTYKEY propkey, [MarshalAs(UnmanagedType.LPWStr)] out string ppszCanonicalName);
// Clears the PROPVARIANT at pvar; ExtractObjects calls it on the pointer returned by IFilter.GetValue.
[DllImport("ole32")]
private static extern int PropVariantClear(IntPtr pvar);
// Managed declaration of the native IFilter COM interface that ExtractObjects drives.
// Every method is [PreserveSig], so failures come back as negative return values instead of exceptions.
// NOTE: for an IUnknown-based ComImport interface the member order defines the vtable layout; do not reorder.
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
private interface IFilter
{
// Prepares the filter; ExtractObjects passes no flags and no attributes and discards pFlags.
[PreserveSig]
int Init(IFILTER_INIT grfFlags, int cAttributes, FULLPROPSPEC[] aAttributes, out IFILTER_FLAGS pFlags);
// Advances to the next chunk and describes it in pStat; a negative result ends ExtractObjects' loop.
[PreserveSig]
int GetChunk(out STAT_CHUNK pStat);
// Reads text of the current chunk into awcBuffer; pcwcBuffer is passed in as the buffer capacity
// (in characters) and is updated by the callee. Called repeatedly until failure or FILTER_S_LAST_TEXT.
[PreserveSig]
int GetText(ref int pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);
// Returns a pointer to a native PROPVARIANT holding the current chunk's value.
[PreserveSig]
int GetValue(out IntPtr ppPropValue);
// Not called by this file; declared to keep the interface layout complete.
[PreserveSig]
int BindRegion(FILTERREGION origPos, [MarshalAs(UnmanagedType.LPStruct)] Guid riid, IntPtr ppunk);
}
// Managed declaration of the native IInitializeWithStream COM interface, used to hand the
// PDF content to the filter as an IStream before IFilter.Init is called.
[ComImport, Guid("b824b49d-22ac-4161-ac8a-9916e8fa3f7f"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
private interface IInitializeWithStream
{
    // pstream: the stream to read the document from.
    // grfMode: access mode requested for the stream; ExtractObjects passes 0.
    // Returns an HRESULT ([PreserveSig]); negative means initialization failed.
    // (Parameters were previously named grfFlags/mode, which did not describe what they carry.)
    [PreserveSig]
    int Initialize(IStream pstream, int grfMode);
}
// Identifies a property: the property set GUID plus the property specifier within that set.
// Sequential layout because it is marshaled to/from native code (embedded in STAT_CHUNK, passed to IFilter.Init).
[StructLayout(LayoutKind.Sequential)]
private struct FULLPROPSPEC
{
public Guid guidPropSet;
public PROPSPEC psProperty;
}
// Property specifier. ulKind says how to interpret propid: ExtractObjects treats it as a numeric
// property id for PRSPEC_PROPID and as a pointer to a Unicode string for PRSPEC_LPWSTR.
[StructLayout(LayoutKind.Sequential)]
private struct PROPSPEC
{
public PRSPEC ulKind;
// Pointer-sized so the same field can carry either the id or the string pointer.
public IntPtr propid;// or lpwstr;
}
// Key passed by reference to PSGetNameFromPropertyKey: a format id GUID plus an integer property id.
// ExtractObjects fills it from a chunk's guidPropSet and propid.
[StructLayout(LayoutKind.Sequential)]
private struct PROPERTYKEY
{
public Guid fmtid;
public int propid;
}
// Chunk descriptor filled in by IFilter.GetChunk. ExtractObjects only reads 'flags' (text vs. value)
// and 'attribute' (which property the chunk belongs to); the other fields exist to keep the
// sequential layout complete.
[StructLayout(LayoutKind.Sequential)]
private struct STAT_CHUNK
{
public int idChunk;
public CHUNK_BREAKTYPE breakType;
public CHUNKSTATE flags;
public int locale;
public FULLPROPSPEC attribute;
public int idChunkSource;
public int cwcStartSource;
public int cwcLenSource;
}
// Region descriptor (chunk id, start offset, extent). In this file it only appears as the
// first parameter of IFilter.BindRegion, which is never called.
[StructLayout(LayoutKind.Sequential)]
private struct FILTERREGION
{
public int idChunk;
public int cwcStart;
public int cwcExtent;
}
// Type of STAT_CHUNK.breakType. The value is received from the filter but not inspected by ExtractObjects.
// NOTE(review): names and values are expected to mirror the native CHUNK_BREAKTYPE enumeration; verify against the Windows SDK.
private enum CHUNK_BREAKTYPE
{
CHUNK_NO_BREAK = 0,
CHUNK_EOW = 1,
CHUNK_EOS = 2,
CHUNK_EOP = 3,
CHUNK_EOC = 4
}
// Bit flags reported in STAT_CHUNK.flags. ExtractObjects tests CHUNK_TEXT to read text via GetText
// and CHUNK_VALUE to read a property value via GetValue; CHUNK_FILTER_OWNED_VALUE is not checked.
[Flags]
private enum CHUNKSTATE
{
CHUNK_TEXT = 0x1,
CHUNK_VALUE = 0x2,
CHUNK_FILTER_OWNED_VALUE = 0x4
}
// Bit flags accepted by IFilter.Init's grfFlags parameter. ExtractObjects passes 0 (none of these).
// Values are listed in the original declaration order, which is not strictly ascending.
// NOTE(review): names and values are expected to mirror the native IFILTER_INIT enumeration; verify against the Windows SDK.
[Flags]
private enum IFILTER_INIT
{
IFILTER_INIT_CANON_PARAGRAPHS = 0x0001,
IFILTER_INIT_HARD_LINE_BREAKS = 0x0002,
IFILTER_INIT_CANON_HYPHENS = 0x0004,
IFILTER_INIT_CANON_SPACES = 0x0008,
IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 0x0010,
IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 0x0020,
IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 0x0100,
IFILTER_INIT_INDEXING_ONLY = 0x0040,
IFILTER_INIT_SEARCH_LINKS = 0x0080,
IFILTER_INIT_FILTER_OWNED_VALUE_OK = 0x0200,
IFILTER_INIT_FILTER_AGGRESSIVE_BREAK = 0x0400,
IFILTER_INIT_DISABLE_EMBEDDED = 0x0800,
IFILTER_INIT_EMIT_FORMATTING = 0x1000,
}
// Type of the pFlags output of IFilter.Init. ExtractObjects discards that output.
private enum IFILTER_FLAGS
{
IFILTER_FLAGS_OLE_PROPERTIES = 1,
}
// Discriminator stored in PROPSPEC.ulKind: tells whether PROPSPEC.propid carries
// a string pointer (PRSPEC_LPWSTR) or a numeric property id (PRSPEC_PROPID).
private enum PRSPEC
{
PRSPEC_LPWSTR = 0,
PRSPEC_PROPID = 1,
}
// Read-only adapter exposing a managed Stream through the COM IStream interface so it can be
// handed to IInitializeWithStream. Only Read, Seek and Stat are supported; every mutating or
// cloning operation throws NotSupportedException. The wrapped stream is not disposed here.
[ComVisible(true)]
private sealed class ManagedIStream : IStream
{
    private readonly Stream _stream;

    // Throws ArgumentNullException if no stream is supplied.
    public ManagedIStream(Stream stream) => _stream = stream ?? throw new ArgumentNullException(nameof(stream));

    // Reads up to cb bytes into pv and, when pcbRead is non-null, stores the byte count there.
    public void Read(byte[] pv, int cb, IntPtr pcbRead)
    {
        // Stream.Read may legally return fewer bytes than requested before the end of the data,
        // whereas COM callers interpret a short read as end of stream. Keep reading until the
        // request is satisfied or the stream is really exhausted.
        var total = 0;
        while (total < cb)
        {
            var read = _stream.Read(pv, total, cb - total);
            if (read <= 0)
                break;

            total += read;
        }

        if (pcbRead != IntPtr.Zero)
        {
            Marshal.WriteInt32(pcbRead, total);
        }
    }

    // Moves the stream position; dwOrigin is cast directly to SeekOrigin (0 = begin, 1 = current, 2 = end).
    // When plibNewPosition is non-null the resulting absolute position is written there as a 64-bit value.
    public void Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
    {
        var newPos = _stream.Seek(dlibMove, (SeekOrigin)dwOrigin);
        if (plibNewPosition != IntPtr.Zero)
        {
            Marshal.WriteInt64(plibNewPosition, newPos);
        }
    }

    // Reports the stream's length; grfStatFlag is ignored and all other fields keep their defaults.
    public void Stat(out System.Runtime.InteropServices.ComTypes.STATSTG pstatstg, int grfStatFlag)
    {
        pstatstg = new System.Runtime.InteropServices.ComTypes.STATSTG
        {
            cbSize = _stream.Length,
            type = 2, // STGTY_STREAM
            grfMode = 0, // STGM_READ,
        };
    }

    // Unsupported operations: this adapter is read-only.
    public void Write(byte[] pv, int cb, IntPtr pcbWritten) => throw new NotSupportedException();
    public void CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten) => throw new NotSupportedException();
    public void SetSize(long libNewSize) => throw new NotSupportedException();
    public void Commit(int grfCommitFlags) => throw new NotSupportedException();
    public void Revert() => throw new NotSupportedException();
    public void LockRegion(long libOffset, long cb, int dwLockType) => throw new NotSupportedException();
    public void UnlockRegion(long libOffset, long cb, int dwLockType) => throw new NotSupportedException();
    public void Clone(out IStream ppstm) => throw new NotSupportedException();
}
}
}