Introduction to OCR
Optical Character Recognition (OCR) is the technology that converts different types of documents—scanned paper documents, PDF files, or images captured by a digital camera—into editable and searchable data. This comprehensive guide covers everything from basic concepts to building production-grade OCR systems.
1. OCR Pipeline Overview
A complete OCR pipeline consists of several stages (sketched in code right after this list):
- Image Acquisition: Capturing or loading the input image
- Preprocessing: Enhancing image quality for better recognition
- Text Detection: Locating text regions in the image
- Text Recognition: Converting detected regions to text
- Post-processing: Correcting errors and formatting output
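Conceptually, these stages compose into a single function. A minimal sketch; the stage functions (`acquire`, `preprocess`, `detect_text`, `recognize`, `postprocess`) are hypothetical placeholders, not a real API:

```python
# Hypothetical skeleton: each stage function is a placeholder
# to be filled in with the techniques covered below.
def run_ocr_pipeline(source):
    image = acquire(source)       # load or capture the input
    clean = preprocess(image)     # denoise, binarize, deskew
    regions = detect_text(clean)  # bounding boxes of text areas
    raw = [recognize(clean, box) for box in regions]
    return postprocess(raw)       # spell-check, format output
```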
2. Image Preprocessing Techniques
Essential Preprocessing Steps
```python
import cv2
import numpy as np

def preprocess_for_ocr(image_path):
    # Read image
    img = cv2.imread(image_path)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Noise reduction
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # Binarization using Otsu's method
    _, binary = cv2.threshold(
        denoised, 0, 255,
        cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # Deskewing: fit a rotated rectangle around all foreground pixels.
    # Note: np.where returns (row, col) coordinates; minAreaRect still
    # yields a usable skew angle for the point cloud.
    coords = np.column_stack(np.where(binary > 0))
    angle = cv2.minAreaRect(coords)[-1]
    # The correction below assumes the pre-OpenCV-4.5 angle convention
    # ([-90, 0)); OpenCV 4.5+ returns angles in [0, 90), so verify the
    # convention for your version.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = binary.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    deskewed = cv2.warpAffine(
        binary, M, (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE
    )
    return deskewed
```
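A quick check of the pipeline, assuming a hypothetical input file named `scan.png`:

```python
# Hypothetical usage: clean a scan and save it for inspection
clean = preprocess_for_ocr("scan.png")
cv2.imwrite("scan_clean.png", clean)
```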
Handling Difficult Documents
```python
def adaptive_preprocessing(image):
    """Handle varying lighting conditions."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Adaptive thresholding for uneven lighting
    adaptive = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # Morphological operations to clean up speckle.
    # Note: a 1x1 kernel is a no-op, so use a small 2x2 kernel.
    kernel = np.ones((2, 2), np.uint8)
    cleaned = cv2.morphologyEx(adaptive, cv2.MORPH_CLOSE, kernel)
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    return cleaned
```
3. Tesseract OCR
Basic Usage
```python
import pytesseract
from PIL import Image

# Basic OCR
text = pytesseract.image_to_string(Image.open('document.png'))

# With language specification
text_id = pytesseract.image_to_string(
    Image.open('document.png'),
    lang='ind+eng'  # Indonesian + English
)

# Get detailed output with bounding boxes
data = pytesseract.image_to_data(
    Image.open('document.png'),
    output_type=pytesseract.Output.DICT
)

# Extract structured data
for i, word in enumerate(data['text']):
    if word.strip():
        x, y, w, h = (
            data['left'][i], data['top'][i],
            data['width'][i], data['height'][i]
        )
        conf = data['conf'][i]
        print(f"Word: {word}, Confidence: {conf}%, Box: ({x},{y},{w},{h})")
```
Configuration Options
```python
# PSM (Page Segmentation Mode) options
PSM_MODES = {
    0: "Orientation and script detection (OSD) only",
    1: "Automatic page segmentation with OSD",
    3: "Fully automatic page segmentation, no OSD (default)",
    4: "Assume a single column of text of variable sizes",
    6: "Assume a single uniform block of text",
    7: "Treat the image as a single text line",
    8: "Treat the image as a single word",
    11: "Sparse text - find as much text as possible, in no particular order",
    13: "Raw line - single text line, bypassing Tesseract-specific hacks"
}

# Custom configuration: LSTM engine, uniform text block, digits only
custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789'
numbers_only = pytesseract.image_to_string(image, config=custom_config)
```
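The `--oem` flag in the config above selects the recognition engine. For reference, the standard Tesseract 4+ engine modes:

```python
# OEM (OCR Engine Mode) options for the --oem flag
OEM_MODES = {
    0: "Legacy engine only",
    1: "Neural nets LSTM engine only",
    2: "Legacy + LSTM engines",
    3: "Default, based on what is available"
}
```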
4. Deep Learning OCR with CRNN
CRNN Architecture
A Convolutional Recurrent Neural Network (CRNN) combines a CNN for visual feature extraction with a recurrent network for sequence modeling:
```python
import torch
import torch.nn as nn

class CRNN(nn.Module):
    def __init__(self, img_height, num_channels, num_classes, hidden_size=256):
        super(CRNN, self).__init__()

        # CNN feature extractor; with img_height=32 the pooling stages
        # below reduce the feature-map height to exactly 1
        self.cnn = nn.Sequential(
            nn.Conv2d(num_channels, 64, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256), nn.ReLU(),
            nn.Conv2d(256, 256, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512), nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1), nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(512, 512, 2, 1, 0), nn.ReLU()
        )

        # RNN sequence modeling
        self.rnn = nn.LSTM(
            512, hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        # Output layer (bidirectional doubles the hidden size)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        # CNN features
        conv = self.cnn(x)  # [B, C, H, W], with H == 1 for 32-px-high input
        # Collapse the height dimension and treat width as time steps
        b, c, h, w = conv.size()
        conv = conv.squeeze(2).permute(0, 2, 1)  # [B, W, C]
        # RNN
        rnn_out, _ = self.rnn(conv)  # [B, W, hidden*2]
        # Per-timestep class scores
        output = self.fc(rnn_out)  # [B, W, num_classes]
        return output
```
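CRNNs are typically trained with CTC loss, which aligns the per-timestep outputs to an unsegmented target string. A minimal sketch using torch.nn.CTCLoss; the shapes and label encoding are assumptions for illustration (class 0 reserved as the CTC blank):

```python
# Minimal CTC training step (illustrative shapes and labels)
model = CRNN(img_height=32, num_channels=1, num_classes=37)  # 36 chars + blank
criterion = nn.CTCLoss(blank=0)

images = torch.randn(4, 1, 32, 128)      # batch of grayscale line images
targets = torch.randint(1, 37, (4, 10))  # encoded labels, never the blank
target_lengths = torch.full((4,), 10, dtype=torch.long)

logits = model(images)                               # [B, W, num_classes]
log_probs = logits.log_softmax(2).permute(1, 0, 2)   # CTC expects [W, B, C]
input_lengths = torch.full((4,), log_probs.size(0), dtype=torch.long)

loss = criterion(log_probs, targets, input_lengths, target_lengths)
loss.backward()
```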
5. Modern OCR with Transformers
TrOCR replaces the CNN+RNN stack with a Vision Transformer encoder and an autoregressive text decoder, trained end to end on text-line images:
```python
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# Load TrOCR model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")

def ocr_with_trocr(image_path):
    image = Image.open(image_path).convert("RGB")
    # Process image into model-ready pixel values
    pixel_values = processor(image, return_tensors="pt").pixel_values
    # Generate text autoregressively
    generated_ids = model.generate(pixel_values, max_length=128)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text
```
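A minimal call, assuming `line_crop.png` is a hypothetical file containing a single cropped text line (TrOCR models are trained on line-level images, so feed them crops rather than full pages):

```python
# Hypothetical usage on a single text-line crop
print(ocr_with_trocr("line_crop.png"))
```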
6. End-to-End OCR Pipeline
The skeleton below ties the stages together; the detector, recognizer, and helper methods are placeholders to fill in with the components covered above:
```python
class OCRPipeline:
    def __init__(self):
        # Placeholder loaders: plug in your detector/recognizer of choice
        self.text_detector = self._load_detector()
        self.text_recognizer = self._load_recognizer()
        # Assumes a spell-checker object exposing a correct() method
        self.spell_checker = SpellChecker()

    def process(self, image_path):
        # Load and preprocess
        image = cv2.imread(image_path)
        preprocessed = self.preprocess(image)

        # Detect text regions
        boxes = self.text_detector.detect(preprocessed)

        # Recognize text in each region
        results = []
        for box in boxes:
            cropped = self.crop_region(preprocessed, box)
            text = self.text_recognizer.recognize(cropped)
            # Post-process
            corrected = self.spell_checker.correct(text)
            results.append({
                'text': corrected,
                'confidence': self.text_recognizer.confidence,
                'bbox': box
            })
        return self.format_output(results)

    def preprocess(self, image):
        # Apply preprocessing pipeline
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        denoised = cv2.fastNlMeansDenoising(gray)
        binary = cv2.adaptiveThreshold(
            denoised, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )
        return binary
```
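For the post-processing step, a minimal word-level sketch using the pyspellchecker package (an assumption, not part of the pipeline above; `pip install pyspellchecker`, English dictionary by default):

```python
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_text(text):
    # Replace each word with its most likely correction;
    # fall back to the original word when none is found
    words = text.split()
    return " ".join(spell.correction(w) or w for w in words)

print(correct_text("recieve the pakage"))  # e.g. "receive the package"
```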
Conclusion
Building robust OCR systems requires understanding both traditional image processing techniques and modern deep learning approaches. The best results often come from combining multiple methods and careful preprocessing tailored to your specific document types.
Key Takeaways
- Preprocessing is critical for OCR accuracy
- Choose the right tool: Tesseract for general use, deep learning for complex cases
- Modern Transformer-based models like TrOCR achieve state-of-the-art results
- Always implement post-processing for error correction
- Consider document-specific optimizations for production systems