See the question and my original answer on StackOverflow

Here is a modified version of the original code that captures only a portion of the screen into a buffer and also returns the stride. The sample then iterates over all the pixels and prints their colors to show how the returned buffer can be used.

In this sample, the buffer is allocated by the function (with `LocalAlloc`), so you must free it (with `LocalFree`) once you're done with it:

// sample usage
int main()
{
  LONG left = 10;
  LONG top = 10;
  LONG width = 100;
  LONG height = 100;
  LPBYTE buffer;
  UINT stride;
  RECT rc = { left, top, left + width, top + height };
  Direct3D9TakeScreenshot(D3DADAPTER_DEFAULT, &buffer, &stride, &rc);

  // In 32bppPBGRA format, each pixel is represented by 4 bytes
  // with one byte each for blue, green, red, and the alpha channel, in that order.
  // But don't forget this is all modulo endianness ...
  // So, on Intel architecture, if we read a pixel from memory
  // as a DWORD, it's reversed (ARGB). The macros below handle that.

  // browse every pixel by line
  for (int h = 0; h < height; h++)
  {
    LPDWORD pixels = (LPDWORD)(buffer + h * stride);
    for (int w = 0; w < width; w++)
    {
      DWORD pixel = pixels[w];
      wprintf(L"#%02X#%02X#%02X#%02X\n", GetBGRAPixelAlpha(pixel), GetBGRAPixelRed(pixel), GetBGRAPixelGreen(pixel), GetBGRAPixelBlue(pixel));
    }
  }

  // get pixel at 50, 50 in the buffer, as #ARGB
  DWORD pixel = GetBGRAPixel(buffer, stride, 50, 50);
  wprintf(L"#%02X#%02X#%02X#%02X\n", GetBGRAPixelAlpha(pixel), GetBGRAPixelRed(pixel), GetBGRAPixelGreen(pixel), GetBGRAPixelBlue(pixel));

  SavePixelsToFile32bppPBGRA(width, height, stride, buffer, L"test.png", GUID_ContainerFormatPng);
  LocalFree(buffer);
  return 0;;
}

// Channel accessors for a pixel that was read from memory as a DWORD:
// blue is the lowest byte, then green, then red, and alpha is the highest byte.
#define GetBGRAPixelBlue(p)         (LOBYTE(p))
#define GetBGRAPixelGreen(p)        (HIBYTE(p))
#define GetBGRAPixelRed(p)          (LOBYTE(HIWORD(p)))
#define GetBGRAPixelAlpha(p)        (HIBYTE(HIWORD(p)))

// Reads the pixel at column x, row y from buffer b whose rows are s bytes apart.
// Every argument is parenthesized so that expression arguments such as
// `y + 1` or `base + offset` expand with the intended precedence.
#define GetBGRAPixel(b,s,x,y)       (((LPDWORD)(((LPBYTE)(b)) + (y) * (s)))[(x)])

int main()

HRESULT Direct3D9TakeScreenshot(UINT adapter, LPBYTE *pBuffer, UINT *pStride, const RECT *pInputRc = nullptr)
{
  if (!pBuffer || !pStride) return E_INVALIDARG;

  HRESULT hr = S_OK;
  IDirect3D9 *d3d = nullptr;
  IDirect3DDevice9 *device = nullptr;
  IDirect3DSurface9 *surface = nullptr;
  D3DPRESENT_PARAMETERS parameters = { 0 };
  D3DDISPLAYMODE mode;
  D3DLOCKED_RECT rc;

  *pBuffer = NULL;
  *pStride = 0;

  // init D3D and get screen size
  d3d = Direct3DCreate9(D3D_SDK_VERSION);
  HRCHECK(d3d->GetAdapterDisplayMode(adapter, &mode));

  LONG width = pInputRc ? (pInputRc->right - pInputRc->left) : mode.Width;
  LONG height = pInputRc ? (pInputRc->bottom - pInputRc->top) : mode.Height;

  parameters.Windowed = TRUE;
  parameters.BackBufferCount = 1;
  parameters.BackBufferHeight = height;
  parameters.BackBufferWidth = width;
  parameters.SwapEffect = D3DSWAPEFFECT_DISCARD;
  parameters.hDeviceWindow = NULL;

  // create device & capture surface (note it needs desktop size, not our capture size)
  HRCHECK(d3d->CreateDevice(adapter, D3DDEVTYPE_HAL, NULL, D3DCREATE_SOFTWARE_VERTEXPROCESSING, &parameters, &device));
  HRCHECK(device->CreateOffscreenPlainSurface(mode.Width, mode.Height, D3DFMT_A8R8G8B8, D3DPOOL_SYSTEMMEM, &surface, nullptr));

  // get pitch/stride to compute the required buffer size
  HRCHECK(surface->LockRect(&rc, pInputRc, 0));
  *pStride = rc.Pitch;
  HRCHECK(surface->UnlockRect());

  // allocate buffer
  *pBuffer = (LPBYTE)LocalAlloc(0, *pStride * height);
  if (!*pBuffer)
  {
    hr = E_OUTOFMEMORY;
    goto cleanup;
  }

  // get the data
  HRCHECK(device->GetFrontBufferData(0, surface));

  // copy it into our buffer
  HRCHECK(surface->LockRect(&rc, pInputRc, 0));
  CopyMemory(*pBuffer, rc.pBits, rc.Pitch * height);
  HRCHECK(surface->UnlockRect());

cleanup:
  if (FAILED(hr))
  {
    if (*pBuffer)
    {
      LocalFree(*pBuffer);
      *pBuffer = NULL;
    }
    *pStride = 0;
  }
  RELEASE(surface);
  RELEASE(device);
  RELEASE(d3d);
  return hr;
}