Abbyy Finereader Python ✦ < Fast >

return result.returncode fine_read_cli("scan.jpg", "output/result", "docx") Batch Processing with CLI from concurrent.futures import ThreadPoolExecutor from tqdm import tqdm def batch_ocr_cli(input_folder, output_folder, max_workers=4): """Process all images in a folder.""" input_folder = Path(input_folder) output_folder = Path(output_folder) output_folder.mkdir(exist_ok=True)

doc.Recognize("English") doc.Export(output_pdf_path, "PDF", export_params) doc.Close()

def submit_ocr_task(self, file_path, output_format="pdf"): """Submit a file for OCR processing.""" with open(file_path, 'rb') as f: files = 'file': (Path(file_path).name, f) data = 'outputFormat': output_format, 'language': 'English', 'recognitionAccuracy': 'high', 'documentProcessingMode': 'auto' response = self.session.post( f"self.base_url/api/v1/tasks", files=files, data=data ) return response.json()['taskId']

def ocr_with_retry(max_retries=3): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): for attempt in range(max_retries): try: return func(*args, **kwargs) except Exception as e: logger.error(f"Attempt attempt+1 failed: e") if attempt == max_retries - 1: raise time.sleep(2 ** attempt) # Exponential backoff return wrapper return decorator abbyy finereader python

return result import logging from functools import wraps logging.basicConfig(level=logging.INFO) logger = logging.getLogger( name )

# Configure PDF export settings export_params = "PDFExportMode": 1, # 1 = Text and pictures (searchable) "PDFAComplianceMode": 1, # PDF/A-1b "PreserveOriginalPageSize": True

result = fine_read_cli(input_path, "temp", "txt") # Your OCR call with open(cache_file, 'wb') as f: pickle.dump(result, f) return result

def ocr_document(self, input_path, output_path, output_format="docx", language="English"): """OCR a single document with full control.""" # Create document object doc = self.app.CreateDocument() # Add image page page = doc.AddImageFile(input_path, 0) # 0 = auto orientation # Analyze layout doc.AnalyzeLayout() # Recognize with specific language doc.Recognize(language) # Export if output_format == "docx": doc.Export(output_path, "DOCX") elif output_format == "txt": doc.Export(output_path, "TEXT") elif output_format == "pdf": doc.Export(output_path, "PDF") # Cleanup doc.Close() return output_path

def wait_and_download(self, file_path, output_path, poll_interval=2): """Submit and wait for completion.""" task_id = self.submit_ocr_task(file_path) while True: status = self.get_task_status(task_id) if status['state'] == 'completed': return self.download_result(task_id, output_path) elif status['state'] == 'failed': raise Exception(f"OCR failed: status.get('error', 'Unknown error')") time.sleep(poll_interval) client = FineReaderServerClient( base_url="http://localhost:8080", username="admin", password="secret" )

def _clean_invoice_number(self, raw): match = re.search(r'INV[-_]?\d5,10', raw) return match.group(0) if match else raw Initialize FineReader COM Object import win32com

# Initialize (choose method) fr = FineReaderCOM() # Requires Windows

Args: input_path: Path to image or PDF output_path: Output file path (without extension) output_format: pdf, docx, xlsx, txt, html """ fine_cmd = r"C:\Program Files (x86)\ABBYY FineReader\FineReaderCmd.exe"

@ocr_with_retry(max_retries=3) def robust_ocr(input_path): # Your OCR implementation pass | Limitation | Alternative | |------------|-------------| | Windows-only (COM method) | Use CLI or Server API | | License required | Tesseract (free), Google Cloud Vision | | Slow for large batches | Use FineReader Server (distributed) | | Complex layout handling | Adobe Extract API | 11. Complete Working Example # full_pipeline.py import os from pathlib import Path import json from datetime import datetime def main(): # Setup input_folder = "./input_scans" output_folder = "./ocr_results" os.makedirs(output_folder, exist_ok=True)

with ThreadPoolExecutor(max_workers=max_workers) as executor: list(tqdm(executor.map(process_one, image_files), total=len(image_files))) batch_ocr_cli("./scans", "./ocr_output", max_workers=2) 5. Method 2: COM Automation (Windows, Deep Control) This method gives you programmatic access to FineReader's object model. Initialize FineReader COM Object import win32com.client import pythoncom import os class FineReaderCOM: def init (self): pythoncom.CoInitialize() self.app = win32com.client.Dispatch("FineReader.Application") self.app.Visible = False # Run in background