GdPicture.NET控件提供了OCR识别模块,利用该模块可以对图片、PDF、TIFF等文件进行文本识别,支持50多种语言,如:英语、法语、意大利语、德语、西班牙语、中文等。识别时不仅可以对多页文档进行完整识别,还可以只对指定区域进行文本识别。本文主要介绍如何利用该产品识别PDF文件中的指定区域,具体代码如下:
// Sample: OCR a rectangular region of the first page of "input.pdf".
// The page is rasterized through GdPicturePDF, then the resulting image is
// passed to the Tesseract OCR entry point on GdPictureImaging. Every outcome
// (recognized text or a failure status) is reported through MessageBox.
GdPictureImaging oGdPictureImaging = new GdPictureImaging();
GdPicturePDF oGdPicturePDF = new GdPicturePDF();
// 0 means "no image rendered"; checked below before use and before release.
int rasterPageID = 0;

// Load the PDF.
if (oGdPicturePDF.LoadFromFile("input.pdf", false) == GdPictureStatus.OK) {
    // Select the first page.
    oGdPicturePDF.SelectPage(1);

    // Render the selected page to a 200 DPI image.
    rasterPageID = oGdPicturePDF.RenderPageToGdPictureImage(200, true);
    if (rasterPageID != 0) {
        // Restrict OCR to a region of interest anchored at (0, 0) and
        // extending to (1000, 1000).
        oGdPictureImaging.SetROI(0, 0, 1000, 1000);

        // Set the OCR context. Another context may give better accuracy
        // depending on the input.
        oGdPictureImaging.OCRTesseractSetOCRContext(OCRContext.OCRContextDocument);

        // "eng" selects the dictionary; the path points at the OCR resource
        // folder of the GdPicture.NET installation and must be adapted.
        string sOCR = oGdPictureImaging.OCRTesseractDoOCR(
            rasterPageID,
            "eng",
            "C:\\Path\\To\\GdPicture.NET 10\\Redist\\OCR",
            "");

        if (oGdPictureImaging.GetStat() == GdPictureStatus.OK) {
            MessageBox.Show(sOCR);
        } else {
            MessageBox.Show(oGdPictureImaging.GetStat().ToString());
        }
    } else {
        // Rendering failed: report the PDF component's status.
        MessageBox.Show(oGdPicturePDF.GetStat().ToString());
    }

    // Close the PDF.
    oGdPicturePDF.CloseDocument();
} else {
    // Loading failed: report the status instead of ending silently.
    MessageBox.Show(oGdPicturePDF.GetStat().ToString());
}

// Release the rendered image, but only if one was actually created.
if (rasterPageID != 0) {
    oGdPictureImaging.ReleaseGdPictureImage(rasterPageID);
}