Introduction to OCR
Optical Character Recognition (OCR) is the technology that converts different types of documents—scanned paper documents, PDF files, or images captured by a digital camera—into editable and searchable data. This comprehensive guide covers everything from basic concepts to building production-grade OCR systems.
1. OCR Pipeline Overview
A complete OCR pipeline consists of several stages (sketched in code right after this list):
- Image Acquisition: Capturing or loading the input image
- Preprocessing: Enhancing image quality for better recognition
- Text Detection: Locating text regions in the image
- Text Recognition: Converting detected regions to text
- Post-processing: Correcting errors and formatting output
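Conceptually, these stages compose into a single function. A minimal sketch; the stage functions (`acquire`, `preprocess`, `detect_text`, `recognize`, `postprocess`) are hypothetical placeholders, not a real API:

```python
# Hypothetical skeleton: each stage function is a placeholder
# to be filled in with the techniques covered below.
def run_ocr_pipeline(source):
    image = acquire(source)       # load or capture the input
    clean = preprocess(image)     # denoise, binarize, deskew
    regions = detect_text(clean)  # bounding boxes of text areas
    raw = [recognize(clean, box) for box in regions]
    return postprocess(raw)       # spell-check, format output
```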
2. Image Preprocessing Techniques
Essential Preprocessing Steps
```python
import cv2
import numpy as np

def preprocess_for_ocr(image_path):
    # Read image
    img = cv2.imread(image_path)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Noise reduction
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # Binarization using Otsu's method
    _, binary = cv2.threshold(
        denoised, 0, 255,
        cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # Deskewing: fit a rotated rectangle around all foreground pixels.
    # Note: np.where returns (row, col) coordinates; minAreaRect still
    # yields a usable skew angle for the point cloud.
    coords = np.column_stack(np.where(binary > 0))
    angle = cv2.minAreaRect(coords)[-1]
    # The correction below assumes the pre-OpenCV-4.5 angle convention
    # ([-90, 0)); OpenCV 4.5+ returns angles in [0, 90), so verify the
    # convention for your version.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = binary.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    deskewed = cv2.warpAffine(
        binary, M, (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE
    )
    return deskewed
```
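A quick check of the pipeline, assuming a hypothetical input file named `scan.png`:

```python
# Hypothetical usage: clean a scan and save it for inspection
clean = preprocess_for_ocr("scan.png")
cv2.imwrite("scan_clean.png", clean)
```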
Handling Difficult Documents
```python
def adaptive_preprocessing(image):
    """Handle varying lighting conditions."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Adaptive thresholding for uneven lighting
    adaptive = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # Morphological operations to clean up speckle.
    # Note: a 1x1 kernel is a no-op, so use a small 2x2 kernel.
    kernel = np.ones((2, 2), np.uint8)
    cleaned = cv2.morphologyEx(adaptive, cv2.MORPH_CLOSE, kernel)
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    return cleaned
```
3. Tesseract OCR
Basic Usage
```python
import pytesseract
from PIL import Image

# Basic OCR
text = pytesseract.image_to_string(Image.open('document.png'))

# With language specification
text_id = pytesseract.image_to_string(
    Image.open('document.png'),
    lang='ind+eng'  # Indonesian + English
)

# Get detailed output with bounding boxes
data = pytesseract.image_to_data(
    Image.open('document.png'),
    output_type=pytesseract.Output.DICT
)

# Extract structured data
for i, word in enumerate(data['text']):
    if word.strip():
        x, y, w, h = (
            data['left'][i], data['top'][i],
            data['width'][i], data['height'][i]
        )
        conf = data['conf'][i]
        print(f"Word: {word}, Confidence: {conf}%, Box: ({x},{y},{w},{h})")
```
Configuration Options
```python
# PSM (Page Segmentation Mode) options
PSM_MODES = {
    0: "Orientation and script detection (OSD) only",
    1: "Automatic page segmentation with OSD",
    3: "Fully automatic page segmentation, no OSD (default)",
    4: "Assume a single column of text of variable sizes",
    6: "Assume a single uniform block of text",
    7: "Treat the image as a single text line",
    8: "Treat the image as a single word",
    11: "Sparse text - find as much text as possible, in no particular order",
    13: "Raw line - single text line, bypassing Tesseract-specific hacks"
}

# Custom configuration: LSTM engine, uniform text block, digits only
custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789'
numbers_only = pytesseract.image_to_string(image, config=custom_config)
```
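The `--oem` flag in the config above selects the recognition engine. For reference, the standard Tesseract 4+ engine modes:

```python
# OEM (OCR Engine Mode) options for the --oem flag
OEM_MODES = {
    0: "Legacy engine only",
    1: "Neural nets LSTM engine only",
    2: "Legacy + LSTM engines",
    3: "Default, based on what is available"
}
```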
4. Deep Learning OCR with CRNN
CRNN Architecture
A Convolutional Recurrent Neural Network (CRNN) combines a CNN for visual feature extraction with a recurrent network for sequence modeling:
```python
import torch
import torch.nn as nn

class CRNN(nn.Module):
    def __init__(self, img_height, num_channels, num_classes, hidden_size=256):
        super(CRNN, self).__init__()

        # CNN feature extractor; with img_height=32 the pooling stages
        # below reduce the feature-map height to exactly 1
        self.cnn = nn.Sequential(
            nn.Conv2d(num_channels, 64, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256), nn.ReLU(),
            nn.Conv2d(256, 256, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512), nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(512, 512, 2, 1, 0), nn.ReLU()
        )

        # RNN sequence modeling
        self.rnn = nn.LSTM(
            512, hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        # Output layer (bidirectional doubles the hidden size)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        # CNN features
        conv = self.cnn(x)  # [B, C, H, W], with H == 1 for 32-px-high input
        # Collapse the height dimension and treat width as time steps
        b, c, h, w = conv.size()
        conv = conv.squeeze(2).permute(0, 2, 1)  # [B, W, C]
        # RNN
        rnn_out, _ = self.rnn(conv)  # [B, W, hidden*2]
        # Per-timestep class scores
        output = self.fc(rnn_out)  # [B, W, num_classes]
        return output
```
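CRNNs are typically trained with CTC loss, which aligns the per-timestep outputs to an unsegmented target string. A minimal sketch using torch.nn.CTCLoss; the shapes and label encoding are assumptions for illustration (class 0 reserved as the CTC blank):

```python
# Minimal CTC training step (illustrative shapes and labels)
model = CRNN(img_height=32, num_channels=1, num_classes=37)  # 36 chars + blank
criterion = nn.CTCLoss(blank=0)

images = torch.randn(4, 1, 32, 128)      # batch of grayscale line images
targets = torch.randint(1, 37, (4, 10))  # encoded labels, never the blank
target_lengths = torch.full((4,), 10, dtype=torch.long)

logits = model(images)                               # [B, W, num_classes]
log_probs = logits.log_softmax(2).permute(1, 0, 2)   # CTC expects [W, B, C]
input_lengths = torch.full((4,), log_probs.size(0), dtype=torch.long)

loss = criterion(log_probs, targets, input_lengths, target_lengths)
loss.backward()
```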
5. Modern OCR with Transformers
TrOCR replaces the CNN+RNN stack with a Vision Transformer encoder and an autoregressive text decoder, trained end to end on text-line images:
```python
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# Load TrOCR model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")

def ocr_with_trocr(image_path):
    image = Image.open(image_path).convert("RGB")
    # Process image into model-ready pixel values
    pixel_values = processor(image, return_tensors="pt").pixel_values
    # Generate text autoregressively
    generated_ids = model.generate(pixel_values, max_length=128)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text
```
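A minimal call, assuming `line_crop.png` is a hypothetical file containing a single cropped text line (TrOCR models are trained on line-level images, so feed them crops rather than full pages):

```python
# Hypothetical usage on a single text-line crop
print(ocr_with_trocr("line_crop.png"))
```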
6. End-to-End OCR Pipeline
The skeleton below ties the stages together; the detector, recognizer, and helper methods are placeholders to fill in with the components covered above:
```python
class OCRPipeline:
    def __init__(self):
        # Placeholder loaders: plug in your detector/recognizer of choice
        self.text_detector = self._load_detector()
        self.text_recognizer = self._load_recognizer()
        # Assumes a spell-checker object exposing a correct() method
        self.spell_checker = SpellChecker()

    def process(self, image_path):
        # Load and preprocess
        image = cv2.imread(image_path)
        preprocessed = self.preprocess(image)

        # Detect text regions
        boxes = self.text_detector.detect(preprocessed)

        # Recognize text in each region
        results = []
        for box in boxes:
            cropped = self.crop_region(preprocessed, box)
            text = self.text_recognizer.recognize(cropped)
            # Post-process
            corrected = self.spell_checker.correct(text)
            results.append({
                'text': corrected,
                'confidence': self.text_recognizer.confidence,
                'bbox': box
            })
        return self.format_output(results)

    def preprocess(self, image):
        # Apply preprocessing pipeline
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        denoised = cv2.fastNlMeansDenoising(gray)
        binary = cv2.adaptiveThreshold(
            denoised, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )
        return binary
```
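For the post-processing step, a minimal word-level sketch using the pyspellchecker package (an assumption, not part of the pipeline above; `pip install pyspellchecker`, English dictionary by default):

```python
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_text(text):
    # Replace each word with its most likely correction;
    # fall back to the original word when none is found
    words = text.split()
    return " ".join(spell.correction(w) or w for w in words)

print(correct_text("recieve the pakage"))  # e.g. "receive the package"
```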
Conclusion
Building robust OCR systems requires understanding both traditional image processing techniques and modern deep learning approaches. The best results often come from combining multiple methods and careful preprocessing tailored to your specific document types.
Key Takeaways
- Preprocessing is critical for OCR accuracy
- Choose the right tool: Tesseract for general use, deep learning for complex cases
- Modern Transformer-based models like TrOCR achieve state-of-the-art results
- Always implement post-processing for error correction
- Consider document-specific optimizations for production systems