diff --git a/examples/computer-vision/App.tsx b/examples/computer-vision/App.tsx
index 8d01269f..488c61cd 100644
--- a/examples/computer-vision/App.tsx
+++ b/examples/computer-vision/App.tsx
@@ -8,11 +8,13 @@ import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
import { View, StyleSheet } from 'react-native';
import { ClassificationScreen } from './screens/ClassificationScreen';
import { ObjectDetectionScreen } from './screens/ObjectDetectionScreen';
+import { OCRScreen } from './screens/OCRScreen';
enum ModelType {
STYLE_TRANSFER,
OBJECT_DETECTION,
CLASSIFICATION,
+ OCR,
}
export default function App() {
@@ -46,6 +48,8 @@ export default function App() {
         return (
           <ClassificationScreen imageUri={imageUri} setImageUri={setImageUri} />
         );
+      case ModelType.OCR:
+        return <OCRScreen imageUri={imageUri} setImageUri={setImageUri} />;
default:
return (
@@ -64,6 +68,7 @@ export default function App() {
'Style Transfer',
'Object Detection',
'Classification',
+ 'OCR',
]}
onValueChange={(_, selectedIndex) => {
handleModeChange(selectedIndex);
diff --git a/examples/computer-vision/components/ImageWithOCRBboxes.tsx b/examples/computer-vision/components/ImageWithOCRBboxes.tsx
new file mode 100644
index 00000000..1c8fe616
--- /dev/null
+++ b/examples/computer-vision/components/ImageWithOCRBboxes.tsx
@@ -0,0 +1,103 @@
+// Import necessary components
+import React from 'react';
+import { Image, StyleSheet, View } from 'react-native';
+import Svg, { Polygon } from 'react-native-svg';
+import { OCRDetection } from 'react-native-executorch';
+
+interface Props {
+ imageUri: string;
+ detections: OCRDetection[];
+ imageWidth: number;
+ imageHeight: number;
+}
+
+export default function ImageWithOCRBboxes({
+ imageUri,
+ detections,
+ imageWidth,
+ imageHeight,
+}: Props) {
+ const [layout, setLayout] = React.useState({ width: 0, height: 0 });
+
+ const calculateAdjustedDimensions = () => {
+ const imageRatio = imageWidth / imageHeight;
+ const layoutRatio = layout.width / layout.height;
+ let sx, sy;
+ if (imageRatio > layoutRatio) {
+ sx = layout.width / imageWidth;
+ sy = layout.width / imageRatio / imageHeight;
+ } else {
+ sy = layout.height / imageHeight;
+ sx = (layout.height * imageRatio) / imageWidth;
+ }
+ return {
+ scaleX: sx,
+ scaleY: sy,
+ offsetX: (layout.width - imageWidth * sx) / 2,
+ offsetY: (layout.height - imageHeight * sy) / 2,
+ };
+ };
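+  // Each detection's bbox corners come in original-image coordinates; they are mapped
+  // into view coordinates below as x * scaleX + offsetX and y * scaleY + offsetY.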
+
+  return (
+    <View
+      style={styles.container}
+      onLayout={(event) => {
+        const { width, height } = event.nativeEvent.layout;
+        setLayout({ width, height });
+      }}
+    >
+      <Image source={{ uri: imageUri }} style={styles.image} resizeMode="contain" />
+      <Svg style={styles.svgContainer}>
+        {detections.map((detection, index) => {
+          const { scaleX, scaleY, offsetX, offsetY } = calculateAdjustedDimensions();
+          const points = detection.bbox
+            .map((p) => `${p.x * scaleX + offsetX},${p.y * scaleY + offsetY}`)
+            .join(' ');
+          return <Polygon key={index} points={points} fill="none" stroke="red" strokeWidth="3" />;
+        })}
+      </Svg>
+    </View>
+  );
+}
+
+const styles = StyleSheet.create({
+ container: {
+ flex: 1,
+ position: 'relative',
+ },
+ image: {
+ flex: 1,
+ width: '100%',
+ height: '100%',
+ },
+ svgContainer: {
+ position: 'absolute',
+ top: 0,
+ left: 0,
+ right: 0,
+ bottom: 0,
+ },
+});
diff --git a/examples/computer-vision/screens/OCRScreen.tsx b/examples/computer-vision/screens/OCRScreen.tsx
new file mode 100644
index 00000000..9d17118a
--- /dev/null
+++ b/examples/computer-vision/screens/OCRScreen.tsx
@@ -0,0 +1,112 @@
+import Spinner from 'react-native-loading-spinner-overlay';
+import { BottomBar } from '../components/BottomBar';
+import { getImage } from '../utils';
+import { OCRDetection, useOCR } from 'react-native-executorch';
+import { View, StyleSheet, Image, Text } from 'react-native';
+import { useState } from 'react';
+import ImageWithOCRBboxes from '../components/ImageWithOCRBboxes';
+
+export const OCRScreen = ({
+ imageUri,
+ setImageUri,
+}: {
+ imageUri: string;
+ setImageUri: (imageUri: string) => void;
+}) => {
+  const [results, setResults] = useState<OCRDetection[]>([]);
+ const [imageDimensions, setImageDimensions] = useState<{
+ width: number;
+ height: number;
+ }>();
+ const [detectedText, setDetectedText] = useState('');
+ const model = useOCR({
+ detectorSource:
+ 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_craft_800.pte',
+ recognizerSources: {
+ recognizerLarge:
+ 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_crnn_512.pte',
+ recognizerMedium:
+ 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_crnn_256.pte',
+ recognizerSmall:
+ 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_crnn_128.pte',
+ },
+ language: 'en',
+ });
+
+ const handleCameraPress = async (isCamera: boolean) => {
+ const image = await getImage(isCamera);
+ const width = image?.width;
+ const height = image?.height;
+ setImageDimensions({ width: width as number, height: height as number });
+ const uri = image?.uri;
+ if (typeof uri === 'string') {
+      setImageUri(uri);
+ setResults([]);
+ setDetectedText('');
+ }
+ };
+
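+  // Note: model.forward resolves to an array of detections; judging from the native module,
+  // each entry has the shape { text: string, bbox: { x: number; y: number }[], score: number }.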
+ const runForward = async () => {
+ try {
+ const output = await model.forward(imageUri);
+ setResults(output);
+ console.log(output);
+ let txt = '';
+      output.forEach((detection: OCRDetection) => {
+ txt += detection.text + ' ';
+ });
+ setDetectedText(txt);
+ } catch (e) {
+ console.error(e);
+ }
+ };
+
+  if (!model.isReady) {
+    return (
+      <Spinner visible={!model.isReady} textContent="Loading the OCR model..." />
+    );
+  }
+
+  return (
+    <>
+      <View style={styles.imageContainer}>
+        {imageUri && imageDimensions?.width && imageDimensions?.height ? (
+          <ImageWithOCRBboxes
+            imageUri={imageUri}
+            detections={results}
+            imageWidth={imageDimensions.width}
+            imageHeight={imageDimensions.height}
+          />
+        ) : (
+          <Image style={styles.image} resizeMode="contain" source={{ uri: imageUri }} />
+        )}
+        <Text>{detectedText}</Text>
+      </View>
+      <BottomBar handleCameraPress={handleCameraPress} runForward={runForward} />
+    </>
+  );
+};
+
+const styles = StyleSheet.create({
+ image: {
+ flex: 2,
+ borderRadius: 8,
+ width: '100%',
+ },
+ imageContainer: {
+ flex: 6,
+ width: '100%',
+ padding: 16,
+ },
+});
diff --git a/ios/RnExecutorch.xcodeproj/project.pbxproj b/ios/RnExecutorch.xcodeproj/project.pbxproj
index af71112a..d7710953 100644
--- a/ios/RnExecutorch.xcodeproj/project.pbxproj
+++ b/ios/RnExecutorch.xcodeproj/project.pbxproj
@@ -37,12 +37,20 @@
LLM.h,
);
};
+ 552754CC2D394AC9006B38A2 /* Exceptions for "RnExecutorch" folder in "Compile Sources" phase from "RnExecutorch" target */ = {
+ isa = PBXFileSystemSynchronizedGroupBuildPhaseMembershipExceptionSet;
+ buildPhase = 550986852CEF541900FECBB8 /* Sources */;
+ membershipExceptions = (
+ models/ocr/utils/DetectorUtils.h,
+ );
+ };
/* End PBXFileSystemSynchronizedGroupBuildPhaseMembershipExceptionSet section */
/* Begin PBXFileSystemSynchronizedRootGroup section */
5509868B2CEF541900FECBB8 /* RnExecutorch */ = {
isa = PBXFileSystemSynchronizedRootGroup;
exceptions = (
+ 552754CC2D394AC9006B38A2 /* Exceptions for "RnExecutorch" folder in "Compile Sources" phase from "RnExecutorch" target */,
550986902CEF541900FECBB8 /* Exceptions for "RnExecutorch" folder in "Copy Files" phase from "RnExecutorch" target */,
);
path = RnExecutorch;
@@ -123,6 +131,7 @@
TargetAttributes = {
550986882CEF541900FECBB8 = {
CreatedOnToolsVersion = 16.1;
+ LastSwiftMigration = 1610;
};
};
};
@@ -275,6 +284,7 @@
550986942CEF541900FECBB8 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
+ CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
OTHER_LDFLAGS = "-ObjC";
PRODUCT_NAME = "$(TARGET_NAME)";
@@ -283,6 +293,8 @@
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO;
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
+ SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+ SWIFT_VERSION = 6.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
@@ -290,6 +302,7 @@
550986952CEF541900FECBB8 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
+ CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
OTHER_LDFLAGS = "-ObjC";
PRODUCT_NAME = "$(TARGET_NAME)";
@@ -298,6 +311,7 @@
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO;
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
+ SWIFT_VERSION = 6.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Release;
diff --git a/ios/RnExecutorch/OCR.h b/ios/RnExecutorch/OCR.h
new file mode 100644
index 00000000..68c08785
--- /dev/null
+++ b/ios/RnExecutorch/OCR.h
@@ -0,0 +1,7 @@
+#import <RnExecutorchSpec/RnExecutorchSpec.h>
+
+constexpr CGFloat recognizerRatio = 1.6;
+
+@interface OCR : NSObject <NativeOCRSpec>
+
+@end
diff --git a/ios/RnExecutorch/OCR.mm b/ios/RnExecutorch/OCR.mm
new file mode 100644
index 00000000..cd58d2c4
--- /dev/null
+++ b/ios/RnExecutorch/OCR.mm
@@ -0,0 +1,83 @@
+#import <React/RCTBridgeModule.h>
+#import "opencv2/opencv.hpp"
+#import "OCR.h"
+#import "utils/Fetcher.h"
+#import "utils/ImageProcessor.h"
+#import "models/ocr/Detector.h"
+#import "models/ocr/RecognitionHandler.h"
+
+@implementation OCR {
+ Detector *detector;
+ RecognitionHandler *recognitionHandler;
+}
+
+RCT_EXPORT_MODULE()
+
+- (void)loadModule:(NSString *)detectorSource
+recognizerSourceLarge:(NSString *)recognizerSourceLarge
+recognizerSourceMedium:(NSString *)recognizerSourceMedium
+recognizerSourceSmall:(NSString *)recognizerSourceSmall
+ symbols:(NSString *)symbols
+ languageDictPath:(NSString *)languageDictPath
+ resolve:(RCTPromiseResolveBlock)resolve
+ reject:(RCTPromiseRejectBlock)reject {
+ detector = [[Detector alloc] init];
+ [detector loadModel:[NSURL URLWithString:detectorSource] completion:^(BOOL success, NSNumber *errorCode) {
+ if (!success) {
+ NSError *error = [NSError errorWithDomain:@"OCRErrorDomain"
+ code:[errorCode intValue]
+ userInfo:@{NSLocalizedDescriptionKey: [NSString stringWithFormat:@"%ld", (long)[errorCode longValue]]}];
+ reject(@"init_module_error", @"Failed to initialize detector module", error);
+ return;
+ }
+ [Fetcher fetchResource:[NSURL URLWithString:languageDictPath] resourceType:ResourceType::TXT completionHandler:^(NSString *filePath, NSError *error) {
+ if (error) {
+ reject(@"init_module_error", @"Failed to initialize converter module", error);
+ return;
+ }
+
+ self->recognitionHandler = [[RecognitionHandler alloc] initWithSymbols:symbols languageDictPath:filePath];
+ [self->recognitionHandler loadRecognizers:recognizerSourceLarge mediumRecognizerPath:recognizerSourceMedium smallRecognizerPath:recognizerSourceSmall completion:^(BOOL allModelsLoaded, NSNumber *errorCode) {
+ if (allModelsLoaded) {
+ resolve(@(YES));
+ } else {
+ NSError *error = [NSError errorWithDomain:@"OCRErrorDomain"
+ code:[errorCode intValue]
+ userInfo:@{NSLocalizedDescriptionKey: [NSString stringWithFormat:@"%ld", (long)[errorCode longValue]]}];
+ reject(@"init_recognizer_error", @"Failed to initialize one or more recognizer models", error);
+ }
+ }];
+ }];
+ }];
+}
+
+- (void)forward:(NSString *)input
+ resolve:(RCTPromiseResolveBlock)resolve
+ reject:(RCTPromiseRejectBlock)reject {
+ /*
+ The OCR consists of two phases:
+ 1. Detection - detecting text regions in the image, the result of this phase is a list of bounding boxes.
+ 2. Recognition - recognizing the text in the bounding boxes, the result is a list of strings and corresponding confidence scores.
+
+   Recognition uses three models; each model is responsible for recognizing text of a different size (e.g. large - 512x64, medium - 256x64, small - 128x64).
+ */
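+  // Note: with the 800x800 detector and recognizerRatio = 1.6, the grayscale image is
+  // resized (with padding) to 1280x1280 before the detected regions are cropped out.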
+ @try {
+ cv::Mat image = [ImageProcessor readImage:input];
+ NSArray* result = [detector runModel:image];
+ cv::Size detectorSize = [detector getModelImageSize];
+ cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
+ result = [self->recognitionHandler recognize:result imgGray:image desiredWidth:detectorSize.width * recognizerRatio desiredHeight:detectorSize.height * recognizerRatio];
+ resolve(result);
+ } @catch (NSException *exception) {
+ reject(@"forward_error", [NSString stringWithFormat:@"%@", exception.reason],
+ nil);
+ }
+}
+
+- (std::shared_ptr<facebook::react::TurboModule>)getTurboModule:
+    (const facebook::react::ObjCTurboModule::InitParams &)params {
+  return std::make_shared<facebook::react::NativeOCRSpecJSI>(params);
+}
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/Detector.h b/ios/RnExecutorch/models/ocr/Detector.h
new file mode 100644
index 00000000..34606972
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/Detector.h
@@ -0,0 +1,25 @@
+#import "opencv2/opencv.hpp"
+#import "BaseModel.h"
+#import "RecognitionHandler.h"
+
+constexpr CGFloat textThreshold = 0.4;
+constexpr CGFloat linkThreshold = 0.4;
+constexpr CGFloat lowTextThreshold = 0.7;
+constexpr CGFloat centerThreshold = 0.5;
+constexpr CGFloat distanceThreshold = 2.0;
+constexpr CGFloat heightThreshold = 2.0;
+constexpr CGFloat restoreRatio = 3.2;
+constexpr int minSideThreshold = 15;
+constexpr int maxSideThreshold = 30;
+constexpr int maxWidth = largeModelWidth + (largeModelWidth * 0.15);
+constexpr int minSize = 20;
+
+const cv::Scalar mean(0.485, 0.456, 0.406);
+const cv::Scalar variance(0.229, 0.224, 0.225);
+
+@interface Detector : BaseModel
+
+- (cv::Size)getModelImageSize;
+- (NSArray *)runModel:(cv::Mat &)input;
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/Detector.mm b/ios/RnExecutorch/models/ocr/Detector.mm
new file mode 100644
index 00000000..411c178d
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/Detector.mm
@@ -0,0 +1,83 @@
+#import "Detector.h"
+#import "../../utils/ImageProcessor.h"
+#import "utils/DetectorUtils.h"
+#import "utils/OCRUtils.h"
+
+/*
+ The model used as the detector is based on the CRAFT (Character Region Awareness for Text Detection) paper.
+ https://arxiv.org/pdf/1904.01941
+ */
+
+@implementation Detector {
+ cv::Size originalSize;
+ cv::Size modelSize;
+}
+
+- (cv::Size)getModelImageSize{
+ if(!modelSize.empty()) {
+ return modelSize;
+ }
+
+ NSArray *inputShape = [module getInputShape: @0];
+ NSNumber *widthNumber = inputShape.lastObject;
+ NSNumber *heightNumber = inputShape[inputShape.count - 2];
+
+ const int height = [heightNumber intValue];
+ const int width = [widthNumber intValue];
+ modelSize = cv::Size(height, width);
+
+ return cv::Size(height, width);
+}
+
+- (NSArray *)preprocess:(cv::Mat &)input {
+ /*
+   The detector accepts an input tensor with a shape of [1, 3, 800, 800].
+   Since resizing strongly affects recognition quality, the image keeps its original
+   aspect ratio and the missing area is filled with padding.
+ */
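+  // The mean and variance declared in Detector.h are the standard ImageNet channel
+  // statistics used to normalize the input.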
+ self->originalSize = cv::Size(input.cols, input.rows);
+
+ cv::Size modelImageSize = [self getModelImageSize];
+ cv::Mat resizedImage;
+ resizedImage = [OCRUtils resizeWithPadding:input desiredWidth:modelImageSize.width desiredHeight:modelImageSize.height];
+ NSArray *modelInput = [ImageProcessor matToNSArray: resizedImage mean:mean variance:variance];
+ return modelInput;
+}
+
+- (NSArray *)postprocess:(NSArray *)output {
+ /*
+   The output of the model consists of two matrices (heat maps):
+   1. ScoreText (score map) - the probability of a region containing a character
+   2. ScoreAffinity (affinity map) - the affinity between characters, used to group characters into a single instance (sequence)
+   Both matrices are 400x400.
+
+ The result of this step is a list of bounding boxes that contain text.
+ */
+ NSArray *predictions = [output objectAtIndex:0];
+
+ cv::Size modelImageSize = [self getModelImageSize];
+ cv::Mat scoreTextCV, scoreAffinityCV;
+ /*
+   The model output interleaves the two heat maps. Each heat map is half the size of
+   the input image, which is why the width and height are divided by 2.
+ */
+ [DetectorUtils interleavedArrayToMats:predictions
+ outputMat1:scoreTextCV
+ outputMat2:scoreAffinityCV
+ withSize:cv::Size(modelImageSize.width / 2, modelImageSize.height / 2)];
+ NSArray* bBoxesList = [DetectorUtils getDetBoxesFromTextMap:scoreTextCV affinityMap:scoreAffinityCV usingTextThreshold:textThreshold linkThreshold:linkThreshold lowTextThreshold:lowTextThreshold];
+ NSLog(@"Detected boxes: %lu", (unsigned long)bBoxesList.count);
+ bBoxesList = [DetectorUtils restoreBboxRatio:bBoxesList usingRestoreRatio: restoreRatio];
+ bBoxesList = [DetectorUtils groupTextBoxes:bBoxesList centerThreshold:centerThreshold distanceThreshold:distanceThreshold heightThreshold:heightThreshold minSideThreshold:minSideThreshold maxSideThreshold:maxSideThreshold maxWidth:maxWidth];
+
+ return bBoxesList;
+}
+
+- (NSArray *)runModel:(cv::Mat &)input {
+ NSArray *modelInput = [self preprocess:input];
+ NSArray *modelResult = [self forward:modelInput];
+ NSArray *result = [self postprocess:modelResult];
+ return result;
+}
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/RecognitionHandler.h b/ios/RnExecutorch/models/ocr/RecognitionHandler.h
new file mode 100644
index 00000000..72ec004f
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/RecognitionHandler.h
@@ -0,0 +1,16 @@
+#import "opencv2/opencv.hpp"
+
+constexpr int modelHeight = 64;
+constexpr int largeModelWidth = 512;
+constexpr int mediumModelWidth = 256;
+constexpr int smallModelWidth = 128;
+constexpr CGFloat lowConfidenceThreshold = 0.3;
+constexpr CGFloat adjustContrast = 0.2;
+
+@interface RecognitionHandler : NSObject
+
+- (instancetype)initWithSymbols:(NSString *)symbols languageDictPath:(NSString *)languageDictPath;
+- (void)loadRecognizers:(NSString *)largeRecognizerPath mediumRecognizerPath:(NSString *)mediumRecognizerPath smallRecognizerPath:(NSString *)smallRecognizerPath completion:(void (^)(BOOL, NSNumber *))completion;
+- (NSArray *)recognize:(NSArray *)bBoxesList imgGray:(cv::Mat)imgGray desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight;
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/RecognitionHandler.mm b/ios/RnExecutorch/models/ocr/RecognitionHandler.mm
new file mode 100644
index 00000000..50e303df
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/RecognitionHandler.mm
@@ -0,0 +1,124 @@
+#import <Foundation/Foundation.h>
+#import "ExecutorchLib/ETModel.h"
+#import "../../utils/Fetcher.h"
+#import "../../utils/ImageProcessor.h"
+#import "./utils/CTCLabelConverter.h"
+#import "./utils/OCRUtils.h"
+#import "./utils/RecognizerUtils.h"
+#import "Recognizer.h"
+#import "RecognitionHandler.h"
+
+/*
+ RecognitionHandler is responsible for loading the recognizer models and choosing the appropriate
+ one based on the width of each detected text region; it also handles converting the model output to text.
+ */
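+// Crop width decides the model: >= 512 px uses the large recognizer, >= 256 px the medium one,
+// and anything smaller the small one (see runModel below).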
+
+@implementation RecognitionHandler {
+ Recognizer *recognizerLarge;
+ Recognizer *recognizerMedium;
+ Recognizer *recognizerSmall;
+ CTCLabelConverter *converter;
+}
+
+- (instancetype)initWithSymbols:(NSString *)symbols languageDictPath:(NSString *)languageDictPath {
+ self = [super init];
+ if (self) {
+ recognizerLarge = [[Recognizer alloc] init];
+ recognizerMedium = [[Recognizer alloc] init];
+ recognizerSmall = [[Recognizer alloc] init];
+
+ converter = [[CTCLabelConverter alloc] initWithCharacters:symbols separatorList:@{} dictPathList:@{@"key": languageDictPath}];
+ }
+ return self;
+}
+
+- (void)loadRecognizers:(NSString *)largeRecognizerPath mediumRecognizerPath:(NSString *)mediumRecognizerPath smallRecognizerPath:(NSString *)smallRecognizerPath completion:(void (^)(BOOL, NSNumber *))completion {
+ dispatch_group_t group = dispatch_group_create();
+ __block BOOL allSuccessful = YES;
+
+ NSArray *recognizers = @[recognizerLarge, recognizerMedium, recognizerSmall];
+ NSArray *paths = @[largeRecognizerPath, mediumRecognizerPath, smallRecognizerPath];
+
+ for (NSInteger i = 0; i < recognizers.count; i++) {
+ Recognizer *recognizer = recognizers[i];
+ NSString *path = paths[i];
+
+ dispatch_group_enter(group);
+ [recognizer loadModel:[NSURL URLWithString: path] completion:^(BOOL success, NSNumber *errorCode) {
+ if (!success) {
+ allSuccessful = NO;
+ dispatch_group_leave(group);
+ completion(NO, errorCode);
+ return;
+ }
+ dispatch_group_leave(group);
+ }];
+ }
+
+ dispatch_group_notify(group, dispatch_get_main_queue(), ^{
+ if (allSuccessful) {
+ completion(YES, @(0));
+ }
+ });
+}
+
+- (NSArray *)runModel:(cv::Mat)croppedImage {
+ NSArray *result;
+ if(croppedImage.cols >= largeModelWidth) {
+ result = [recognizerLarge runModel:croppedImage];
+ } else if (croppedImage.cols >= mediumModelWidth) {
+ result = [recognizerMedium runModel: croppedImage];
+ } else {
+ result = [recognizerSmall runModel: croppedImage];
+ }
+
+ return result;
+}
+
+- (NSArray *)recognize: (NSArray *)bBoxesList imgGray:(cv::Mat)imgGray desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight {
+ NSDictionary* ratioAndPadding = [RecognizerUtils calculateResizeRatioAndPaddings:imgGray.cols height:imgGray.rows desiredWidth:desiredWidth desiredHeight:desiredHeight];
+ const int left = [ratioAndPadding[@"left"] intValue];
+ const int top = [ratioAndPadding[@"top"] intValue];
+ const CGFloat resizeRatio = [ratioAndPadding[@"resizeRatio"] floatValue];
+ imgGray = [OCRUtils resizeWithPadding:imgGray desiredWidth:desiredWidth desiredHeight:desiredHeight];
+
+ NSMutableArray *predictions = [NSMutableArray array];
+ for (NSDictionary *box in bBoxesList) {
+ cv::Mat croppedImage = [RecognizerUtils getCroppedImage:box image:imgGray modelHeight:modelHeight];
+ if (croppedImage.empty()) {
+ continue;
+ }
+ croppedImage = [RecognizerUtils normalizeForRecognizer:croppedImage adjustContrast:adjustContrast];
+ NSArray *result = [self runModel: croppedImage];
+
+
+ NSNumber *confidenceScore = [result objectAtIndex:1];
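+    // If the recognizer is not confident, the crop may be upside down: retry with the
+    // image rotated by 180 degrees and keep whichever result scores higher.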
+ if([confidenceScore floatValue] < lowConfidenceThreshold){
+ cv::rotate(croppedImage, croppedImage, cv::ROTATE_180);
+
+ NSArray *rotatedResult = [self runModel: croppedImage];
+ NSNumber *rotatedConfidenceScore = [rotatedResult objectAtIndex:1];
+
+ if ([rotatedConfidenceScore floatValue] > [confidenceScore floatValue]) {
+ result = rotatedResult;
+ confidenceScore = rotatedConfidenceScore;
+ }
+ }
+
+ NSArray *predIndex = [result objectAtIndex:0];
+ NSArray* decodedTexts = [converter decodeGreedy:predIndex length:(int)(predIndex.count)];
+
+ NSMutableArray *bbox = [NSMutableArray arrayWithCapacity:4];
+ for (NSValue *coords in box[@"bbox"]){
+ const CGPoint point = [coords CGPointValue];
+ [bbox addObject: @{@"x": @((point.x - left) * resizeRatio), @"y": @((point.y - top) * resizeRatio)}];
+ }
+
+ NSDictionary *res = @{@"text": decodedTexts[0], @"bbox": bbox, @"score": confidenceScore};
+ [predictions addObject:res];
+ }
+
+ return predictions;
+}
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/Recognizer.h b/ios/RnExecutorch/models/ocr/Recognizer.h
new file mode 100644
index 00000000..63047ac0
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/Recognizer.h
@@ -0,0 +1,8 @@
+#import "opencv2/opencv.hpp"
+#import "BaseModel.h"
+
+@interface Recognizer : BaseModel
+
+- (NSArray *)runModel:(cv::Mat &)input;
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/Recognizer.mm b/ios/RnExecutorch/models/ocr/Recognizer.mm
new file mode 100644
index 00000000..a6d9f713
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/Recognizer.mm
@@ -0,0 +1,72 @@
+#import "Recognizer.h"
+#import "RecognizerUtils.h"
+#import "../../utils/ImageProcessor.h"
+#import "utils/OCRUtils.h"
+
+/*
+ The model used as the recognizer is based on the CRNN paper.
+ https://arxiv.org/pdf/1507.05717
+ */
+
+@implementation Recognizer {
+ cv::Size originalSize;
+}
+
+- (cv::Size)getModelImageSize{
+ NSArray *inputShape = [module getInputShape: @0];
+ NSNumber *widthNumber = inputShape.lastObject;
+ NSNumber *heightNumber = inputShape[inputShape.count - 2];
+
+ const int height = [heightNumber intValue];
+ const int width = [widthNumber intValue];
+ return cv::Size(height, width);
+}
+
+- (cv::Size)getModelOutputSize{
+ NSArray *outputShape = [module getOutputShape: @0];
+ NSNumber *widthNumber = outputShape.lastObject;
+ NSNumber *heightNumber = outputShape[outputShape.count - 2];
+
+ const int height = [heightNumber intValue];
+ const int width = [widthNumber intValue];
+ return cv::Size(height, width);
+}
+
+- (NSArray *)preprocess:(cv::Mat &)input {
+ return [ImageProcessor matToNSArrayGray:input];
+}
+
+- (NSArray *)postprocess:(NSArray *)output {
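+  /*
+   The flat model output is reshaped into a [numRows x modelOutputHeight] matrix
+   (one row per output position, one column per symbol class), softmax-normalized per row,
+   and reduced to the per-row argmax indices plus a single confidence score.
+   */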
+ const int modelOutputHeight = [self getModelOutputSize].height;
+ NSInteger numElements = [output.firstObject count];
+ NSInteger numRows = (numElements + modelOutputHeight - 1) / modelOutputHeight;
+ cv::Mat resultMat = cv::Mat::zeros(numRows, modelOutputHeight, CV_32F);
+ NSInteger counter = 0;
+ NSInteger currentRow = 0;
+ for (NSNumber *num in output.firstObject) {
+    resultMat.at<float>(currentRow, counter) = [num floatValue];
+ counter++;
+ if (counter >= modelOutputHeight) {
+ counter = 0; currentRow++;
+ }
+ }
+
+ cv::Mat probabilities = [RecognizerUtils softmax:resultMat];
+ NSMutableArray *predsNorm = [RecognizerUtils sumProbabilityRows:probabilities modelOutputHeight:modelOutputHeight];
+ probabilities = [RecognizerUtils divideMatrix:probabilities byVector:predsNorm];
+ NSArray *maxValuesIndices = [RecognizerUtils findMaxValuesAndIndices:probabilities];
+ const CGFloat confidenceScore = [RecognizerUtils computeConfidenceScore:maxValuesIndices[0] indicesArray:maxValuesIndices[1]];
+
+ return @[maxValuesIndices[1], @(confidenceScore)];
+}
+
+- (NSArray *)runModel:(cv::Mat &)input {
+ NSArray *modelInput = [self preprocess:input];
+ NSArray *modelResult = [self forward:modelInput];
+ NSArray *result = [self postprocess:modelResult];
+
+ return result;
+}
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.h b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.h
new file mode 100644
index 00000000..037782f4
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.h
@@ -0,0 +1,15 @@
+#import <Foundation/Foundation.h>
+
+@interface CTCLabelConverter : NSObject
+
+@property(strong, nonatomic) NSMutableDictionary *dict;
+@property(strong, nonatomic) NSArray *character;
+@property(strong, nonatomic) NSDictionary *separatorList;
+@property(strong, nonatomic) NSArray *ignoreIdx;
+@property(strong, nonatomic) NSDictionary *dictList;
+
+- (instancetype)initWithCharacters:(NSString *)characters separatorList:(NSDictionary *)separatorList dictPathList:(NSDictionary *)dictPathList;
+- (void)loadDictionariesWithDictPathList:(NSDictionary *)dictPathList;
+- (NSArray *)decodeGreedy:(NSArray *)textIndex length:(NSInteger)length;
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.mm b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.mm
new file mode 100644
index 00000000..644a29e2
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.mm
@@ -0,0 +1,93 @@
+#import "CTCLabelConverter.h"
+
+@implementation CTCLabelConverter
+
+- (instancetype)initWithCharacters:(NSString *)characters separatorList:(NSDictionary *)separatorList dictPathList:(NSDictionary *)dictPathList {
+ self = [super init];
+ if (self) {
+ _dict = [NSMutableDictionary dictionary];
+ NSMutableArray *mutableCharacters = [NSMutableArray arrayWithObject:@"[blank]"];
+
+ for (NSUInteger i = 0; i < [characters length]; i++) {
+ NSString *charStr = [NSString stringWithFormat:@"%C", [characters characterAtIndex:i]];
+ [mutableCharacters addObject:charStr];
+ self.dict[charStr] = @(i + 1);
+ }
+
+ _character = [mutableCharacters copy];
+ _separatorList = separatorList;
+
+ NSMutableArray *ignoreIndexes = [NSMutableArray arrayWithObject:@(0)];
+ for (NSString *sep in separatorList.allValues) {
+ NSUInteger index = [characters rangeOfString:sep].location;
+ if (index != NSNotFound) {
+ [ignoreIndexes addObject:@(index)];
+ }
+ }
+ _ignoreIdx = [ignoreIndexes copy];
+ _dictList = [NSDictionary dictionary];
+ [self loadDictionariesWithDictPathList:dictPathList];
+ }
+ return self;
+}
+
+- (void)loadDictionariesWithDictPathList:(NSDictionary *)dictPathList {
+ NSMutableDictionary *tempDictList = [NSMutableDictionary dictionary];
+ for (NSString *lang in dictPathList.allKeys) {
+ NSString *dictPath = dictPathList[lang];
+ NSError *error;
+ NSString *fileContents = [NSString stringWithContentsOfFile:dictPath encoding:NSUTF8StringEncoding error:&error];
+ if (error) {
+ NSLog(@"Error reading file: %@", error.localizedDescription);
+ continue;
+ }
+ NSArray *lines = [fileContents componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]];
+ [tempDictList setObject:lines forKey:lang];
+ }
+ _dictList = [tempDictList copy];
+}
+
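+/*
+ Greedy CTC decoding: repeated indices are collapsed and blank/separator indices are dropped.
+ For example, the index sequence [5, 5, 0, 5] decodes to the character at index 5 twice,
+ because the blank (0) in between breaks the repetition.
+ */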
+- (NSArray *)decodeGreedy:(NSArray *)textIndex length:(NSInteger)length {
+ NSMutableArray *texts = [NSMutableArray array];
+ NSUInteger index = 0;
+
+ while (index < textIndex.count) {
+ NSUInteger segmentLength = MIN(length, textIndex.count - index);
+ NSRange range = NSMakeRange(index, segmentLength);
+ NSArray *subArray = [textIndex subarrayWithRange:range];
+
+ NSMutableString *text = [NSMutableString string];
+ NSNumber *lastChar = nil;
+
+ NSMutableArray *isNotRepeated = [NSMutableArray arrayWithObject:@YES];
+ NSMutableArray *isNotIgnored = [NSMutableArray array];
+
+ for (NSUInteger i = 0; i < subArray.count; i++) {
+ NSNumber *currentChar = subArray[i];
+ if (i > 0) {
+ [isNotRepeated addObject:@(![lastChar isEqualToNumber:currentChar])];
+ }
+ [isNotIgnored addObject:@(![self.ignoreIdx containsObject:currentChar])];
+
+ lastChar = currentChar;
+ }
+
+ for (NSUInteger j = 0; j < subArray.count; j++) {
+ if ([isNotRepeated[j] boolValue] && [isNotIgnored[j] boolValue]) {
+ NSUInteger charIndex = [subArray[j] unsignedIntegerValue];
+ [text appendString:self.character[charIndex]];
+ }
+ }
+
+ [texts addObject:text.copy];
+ index += segmentLength;
+
+ if (segmentLength < length) {
+ break;
+ }
+ }
+
+ return texts.copy;
+}
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/DetectorUtils.h b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.h
new file mode 100644
index 00000000..8330cf98
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.h
@@ -0,0 +1,21 @@
+#import "opencv2/opencv.hpp"
+
+constexpr int verticalLineThreshold = 20;
+
+@interface DetectorUtils : NSObject
+
++ (void)interleavedArrayToMats:(NSArray *)array
+ outputMat1:(cv::Mat &)mat1
+ outputMat2:(cv::Mat &)mat2
+ withSize:(cv::Size)size;
++ (NSArray *)getDetBoxesFromTextMap:(cv::Mat)textMap affinityMap:(cv::Mat)affinityMap usingTextThreshold:(CGFloat)textThreshold linkThreshold:(CGFloat)linkThreshold lowTextThreshold:(CGFloat)lowTextThreshold;
++ (NSArray *)restoreBboxRatio:(NSArray *)boxes usingRestoreRatio:(CGFloat)restoreRatio;
++ (NSArray *)groupTextBoxes:(NSArray *)polys
+ centerThreshold:(CGFloat)centerThreshold
+ distanceThreshold:(CGFloat)distanceThreshold
+ heightThreshold:(CGFloat)heightThreshold
+ minSideThreshold:(int)minSideThreshold
+ maxSideThreshold:(int)maxSideThreshold
+ maxWidth:(int)maxWidth;
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/DetectorUtils.mm b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.mm
new file mode 100644
index 00000000..5e49f1f0
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.mm
@@ -0,0 +1,547 @@
+#import "DetectorUtils.h"
+
+@implementation DetectorUtils
+
++ (void)interleavedArrayToMats:(NSArray *)array
+ outputMat1:(cv::Mat &)mat1
+ outputMat2:(cv::Mat &)mat2
+ withSize:(cv::Size)size {
+ mat1 = cv::Mat(size.height, size.width, CV_32F);
+ mat2 = cv::Mat(size.height, size.width, CV_32F);
+
+ for (NSUInteger idx = 0; idx < array.count; idx++) {
+ const CGFloat value = [array[idx] doubleValue];
+ const int x = (idx / 2) % size.width;
+ const int y = (idx / 2) / size.width;
+
+ if (idx % 2 == 0) {
+      mat1.at<float>(y, x) = value;
+    } else {
+      mat2.at<float>(y, x) = value;
+ }
+ }
+}
+
+/**
+ * This method applies a series of image processing operations to identify likely areas of text in the textMap and return the bounding boxes for single words.
+ *
+ * @param textMap A cv::Mat representing a heat map of the characters of text being present in an image.
+ * @param affinityMap A cv::Mat representing a heat map of the affinity between characters.
+ * @param textThreshold A CGFloat representing the threshold for the text map.
+ * @param linkThreshold A CGFloat representing the threshold for the affinity map.
+ * @param lowTextThreshold A CGFloat representing the low-text threshold used to filter out weak detections.
+ *
+ * @return An NSArray containing NSDictionary objects. Each dictionary includes:
+ * - "bbox": an NSArray of CGPoint values representing the vertices of the detected text box.
+ * - "angle": an NSNumber representing the rotation angle of the box.
+ */
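+// Rough pipeline: threshold both maps, combine them, find connected components,
+// dilate each component's mask, and fit a rotated rectangle (cv::minAreaRect) around it.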
++ (NSArray *)getDetBoxesFromTextMap:(cv::Mat)textMap affinityMap:(cv::Mat)affinityMap usingTextThreshold:(CGFloat)textThreshold linkThreshold:(CGFloat)linkThreshold lowTextThreshold:(CGFloat)lowTextThreshold {
+ const int imgH = textMap.rows;
+ const int imgW = textMap.cols;
+ cv::Mat textScore;
+ cv::Mat affinityScore;
+ cv::threshold(textMap, textScore, textThreshold, 1, cv::THRESH_BINARY);
+ cv::threshold(affinityMap, affinityScore, linkThreshold, 1, cv::THRESH_BINARY);
+ cv::Mat textScoreComb = textScore + affinityScore;
+ cv::threshold(textScoreComb, textScoreComb, 0, 1, cv::THRESH_BINARY);
+ cv::Mat binaryMat;
+ textScoreComb.convertTo(binaryMat, CV_8UC1);
+
+ cv::Mat labels, stats, centroids;
+ const int nLabels = cv::connectedComponentsWithStats(binaryMat, labels, stats, centroids, 4);
+
+ NSMutableArray *detectedBoxes = [NSMutableArray array];
+ for (int i = 1; i < nLabels; i++) {
+    const int area = stats.at<int>(i, cv::CC_STAT_AREA);
+ if (area < 10) continue;
+
+ cv::Mat mask = (labels == i);
+ CGFloat maxVal;
+ cv::minMaxLoc(textMap, NULL, &maxVal, NULL, NULL, mask);
+ if (maxVal < lowTextThreshold) continue;
+
+ cv::Mat segMap = cv::Mat::zeros(textMap.size(), CV_8U);
+ segMap.setTo(255, mask);
+
+    const int x = stats.at<int>(i, cv::CC_STAT_LEFT);
+    const int y = stats.at<int>(i, cv::CC_STAT_TOP);
+    const int w = stats.at<int>(i, cv::CC_STAT_WIDTH);
+    const int h = stats.at<int>(i, cv::CC_STAT_HEIGHT);
+ const int dilationRadius = (int)(sqrt((double)(area / MAX(w, h)) ) * 2.0);
+ const int sx = MAX(x - dilationRadius, 0);
+ const int ex = MIN(x + w + dilationRadius + 1, imgW);
+ const int sy = MAX(y - dilationRadius, 0);
+ const int ey = MIN(y + h + dilationRadius + 1, imgH);
+
+ cv::Rect roi(sx, sy, ex - sx, ey - sy);
+ cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + dilationRadius, 1 + dilationRadius));
+ cv::Mat roiSegMap = segMap(roi);
+ cv::dilate(roiSegMap, roiSegMap, kernel);
+
+    std::vector<std::vector<cv::Point>> contours;
+ cv::findContours(segMap, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
+ if (!contours.empty()) {
+ cv::RotatedRect minRect = cv::minAreaRect(contours[0]);
+ cv::Point2f vertices[4];
+ minRect.points(vertices);
+ NSMutableArray *pointsArray = [NSMutableArray arrayWithCapacity:4];
+ for (int j = 0; j < 4; j++) {
+ const CGPoint point = CGPointMake(vertices[j].x, vertices[j].y);
+ [pointsArray addObject:[NSValue valueWithCGPoint:point]];
+ }
+ NSDictionary *dict = @{@"bbox": pointsArray, @"angle": @(minRect.angle)};
+ [detectedBoxes addObject:dict];
+ }
+ }
+
+ return detectedBoxes;
+}
+
++ (NSArray *)restoreBboxRatio:(NSArray *)boxes usingRestoreRatio:(CGFloat)restoreRatio {
+ NSMutableArray *result = [NSMutableArray array];
+ for (NSUInteger i = 0; i < [boxes count]; i++) {
+ NSDictionary *box = boxes[i];
+ NSMutableArray *boxArray = [NSMutableArray arrayWithCapacity:4];
+ for (NSValue *value in box[@"bbox"]) {
+ CGPoint point = [value CGPointValue];
+ point.x *= restoreRatio;
+ point.y *= restoreRatio;
+ [boxArray addObject:[NSValue valueWithCGPoint:point]];
+ }
+ NSDictionary *dict = @{@"bbox": boxArray, @"angle": box[@"angle"]};
+ [result addObject:dict];
+ }
+
+ return result;
+}
+
+/**
+ * This method normalizes the angle returned by cv::minAreaRect, which ranges from 0 to 90 degrees.
+ **/
++ (CGFloat)normalizeAngle:(CGFloat)angle {
+ if (angle > 45) {
+ return angle - 90;
+ }
+ return angle;
+}
+
++ (CGPoint)midpointBetweenPoint:(CGPoint)p1 andPoint:(CGPoint)p2 {
+ return CGPointMake((p1.x + p2.x) / 2, (p1.y + p2.y) / 2);
+}
+
++ (CGFloat)distanceFromPoint:(CGPoint)p1 toPoint:(CGPoint)p2 {
+ const CGFloat xDist = (p2.x - p1.x);
+ const CGFloat yDist = (p2.y - p1.y);
+ return sqrt(xDist * xDist + yDist * yDist);
+}
+
++ (CGPoint)centerOfBox:(NSArray *)box {
+ return [self midpointBetweenPoint:[box[0] CGPointValue] andPoint:[box[2] CGPointValue]];
+}
+
++ (CGFloat)maxSideLength:(NSArray *)points {
+ CGFloat maxSideLength = 0;
+ NSInteger numOfPoints = points.count;
+ for (NSInteger i = 0; i < numOfPoints; i++) {
+ const CGPoint currentPoint = [points[i] CGPointValue];
+ const CGPoint nextPoint = [points[(i + 1) % numOfPoints] CGPointValue];
+
+ const CGFloat sideLength = [self distanceFromPoint:currentPoint toPoint:nextPoint];
+ if (sideLength > maxSideLength) {
+ maxSideLength = sideLength;
+ }
+ }
+ return maxSideLength;
+}
+
++ (CGFloat)minSideLength:(NSArray *)points {
+ CGFloat minSideLength = CGFLOAT_MAX;
+ NSInteger numOfPoints = points.count;
+
+ for (NSInteger i = 0; i < numOfPoints; i++) {
+ const CGPoint currentPoint = [points[i] CGPointValue];
+ const CGPoint nextPoint = [points[(i + 1) % numOfPoints] CGPointValue];
+
+ const CGFloat sideLength = [self distanceFromPoint:currentPoint toPoint:nextPoint];
+ if (sideLength < minSideLength) {
+ minSideLength = sideLength;
+ }
+ }
+
+ return minSideLength;
+}
+
++ (CGFloat)calculateMinimalDistanceBetweenBox:(NSArray *)box1 andBox:(NSArray *)box2 {
+ CGFloat minDistance = CGFLOAT_MAX;
+ for (NSValue *value1 in box1) {
+ const CGPoint corner1 = [value1 CGPointValue];
+ for (NSValue *value2 in box2) {
+ const CGPoint corner2 = [value2 CGPointValue];
+ const CGFloat distance = [self distanceFromPoint:corner1 toPoint:corner2];
+ if (distance < minDistance) {
+ minDistance = distance;
+ }
+ }
+ }
+ return minDistance;
+}
+
++ (NSArray *)rotateBox:(NSArray *)box withAngle:(CGFloat)angle {
+ const CGPoint center = [self centerOfBox:box];
+
+ const CGFloat radians = angle * M_PI / 180.0;
+
+ NSMutableArray *rotatedPoints = [NSMutableArray arrayWithCapacity:4];
+ for (NSValue *value in box) {
+ const CGPoint point = [value CGPointValue];
+
+ const CGFloat translatedX = point.x - center.x;
+ const CGFloat translatedY = point.y - center.y;
+
+ const CGFloat rotatedX = translatedX * cos(radians) - translatedY * sin(radians);
+ const CGFloat rotatedY = translatedX * sin(radians) + translatedY * cos(radians);
+
+ const CGPoint rotatedPoint = CGPointMake(rotatedX + center.x, rotatedY + center.y);
+ [rotatedPoints addObject:[NSValue valueWithCGPoint:rotatedPoint]];
+ }
+
+ return rotatedPoints;
+}
+
+/**
+ * Orders a set of points in a clockwise direction starting with the top-left point.
+ *
+ * Process:
+ * 1. It iterates through each CGPoint extracted from the NSValues.
+ * 2. For each point, it calculates the sum (x + y) and difference (y - x) of the coordinates.
+ * 3. Points are classified into:
+ * - Top-left: Minimum sum.
+ * - Bottom-right: Maximum sum.
+ * - Top-right: Minimum difference.
+ * - Bottom-left: Maximum difference.
+ * 4. The points are ordered starting from the top-left in a clockwise manner: top-left, top-right, bottom-right, bottom-left.
+ */
++ (NSArray *)orderPointsClockwise:(NSArray *)points{
+ CGPoint topLeft, topRight, bottomRight, bottomLeft;
+ CGFloat minSum = FLT_MAX;
+ CGFloat maxSum = -FLT_MAX;
+ CGFloat minDiff = FLT_MAX;
+ CGFloat maxDiff = -FLT_MAX;
+
+ for (NSValue *value in points) {
+ const CGPoint pt = [value CGPointValue];
+ const CGFloat sum = pt.x + pt.y;
+ const CGFloat diff = pt.y - pt.x;
+
+ if (sum < minSum) {
+ minSum = sum;
+ topLeft = pt;
+ }
+ if (sum > maxSum) {
+ maxSum = sum;
+ bottomRight = pt;
+ }
+ if (diff < minDiff) {
+ minDiff = diff;
+ topRight = pt;
+ }
+ if (diff > maxDiff) {
+ maxDiff = diff;
+ bottomLeft = pt;
+ }
+ }
+
+ NSArray *rect = @[[NSValue valueWithCGPoint:topLeft],
+ [NSValue valueWithCGPoint:topRight],
+ [NSValue valueWithCGPoint:bottomRight],
+ [NSValue valueWithCGPoint:bottomLeft]];
+
+ return rect;
+}
+
++ (std::vector<cv::Point2f>)pointsFromNSValues:(NSArray *)nsValues {
+  std::vector<cv::Point2f> points;
+ for (NSValue *value in nsValues) {
+ const CGPoint point = [value CGPointValue];
+ points.emplace_back(point.x, point.y);
+ }
+ return points;
+}
+
++ (NSArray *)nsValuesFromPoints:(cv::Point2f *)points count:(int)count {
+ NSMutableArray *nsValues = [[NSMutableArray alloc] initWithCapacity:count];
+ for (int i = 0; i < count; i++) {
+ [nsValues addObject:[NSValue valueWithCGPoint:CGPointMake(points[i].x, points[i].y)]];
+ }
+ return nsValues;
+}
+
++ (NSArray *)mergeRotatedBoxes:(NSArray *)box1 withBox:(NSArray *)box2 {
+ box1 = [self orderPointsClockwise:box1];
+ box2 = [self orderPointsClockwise:box2];
+
+  std::vector<cv::Point2f> points1 = [self pointsFromNSValues:box1];
+  std::vector<cv::Point2f> points2 = [self pointsFromNSValues:box2];
+
+  std::vector<cv::Point2f> allPoints;
+ allPoints.insert(allPoints.end(), points1.begin(), points1.end());
+ allPoints.insert(allPoints.end(), points2.begin(), points2.end());
+
+  std::vector<int> hullIndices;
+ cv::convexHull(allPoints, hullIndices, false);
+
+  std::vector<cv::Point2f> hullPoints;
+ for (int idx : hullIndices) {
+ hullPoints.push_back(allPoints[idx]);
+ }
+
+ cv::RotatedRect minAreaRect = cv::minAreaRect(hullPoints);
+
+ cv::Point2f rectPoints[4];
+ minAreaRect.points(rectPoints);
+
+ return [self nsValuesFromPoints:rectPoints count:4];
+}
+
++ (NSMutableArray *)removeSmallBoxesFromArray:(NSArray *)boxes usingMinSideThreshold:(CGFloat)minSideThreshold maxSideThreshold:(CGFloat)maxSideThreshold {
+ NSMutableArray *filteredBoxes = [NSMutableArray array];
+
+ for (NSDictionary *box in boxes) {
+ const CGFloat maxSideLength = [self maxSideLength:box[@"bbox"]];
+ const CGFloat minSideLength = [self minSideLength:box[@"bbox"]];
+ if (minSideLength > minSideThreshold && maxSideLength > maxSideThreshold) {
+ [filteredBoxes addObject:box];
+ }
+ }
+
+ return filteredBoxes;
+}
+
++ (CGFloat)minimumYFromBox:(NSArray *)box {
+ __block CGFloat minY = CGFLOAT_MAX;
+ [box enumerateObjectsUsingBlock:^(NSValue * _Nonnull obj, NSUInteger idx, BOOL * _Nonnull stop) {
+ const CGPoint pt = [obj CGPointValue];
+ if (pt.y < minY) {
+ minY = pt.y;
+ }
+ }];
+ return minY;
+}
+
+/**
+ * This method calculates the distances between each sequential pair of points in a presumed quadrilateral,
+ * identifies the two shortest sides, and fits a linear model to the midpoints of these sides. It also evaluates
+ * whether the resulting line should be considered vertical based on a predefined threshold for the x-coordinate differences.
+ *
+ * If the line is vertical it is fitted as a function of x = my + c, otherwise as y = mx + c.
+ *
+ * @return A NSDictionary containing:
+ * - "slope": NSNumber representing the slope (m) of the line.
+ * - "intercept": NSNumber representing the line's intercept (c) with y-axis.
+ * - "isVertical": NSNumber (boolean) indicating whether the line is considered vertical.
+ */
++ (NSDictionary *)fitLineToShortestSides:(NSArray *)points {
+ NSMutableArray *sides = [NSMutableArray array];
+ NSMutableArray *midpoints = [NSMutableArray array];
+
+ for (int i = 0; i < 4; i++) {
+ const CGPoint p1 = [points[i] CGPointValue];
+ const CGPoint p2 = [points[(i + 1) % 4] CGPointValue];
+
+ const CGFloat sideLength = [self distanceFromPoint:p1 toPoint:p2];
+ [sides addObject:@{@"length": @(sideLength), @"index": @(i)}];
+ [midpoints addObject:[NSValue valueWithCGPoint:[self midpointBetweenPoint:p1 andPoint:p2]]];
+ }
+
+ [sides sortUsingDescriptors:@[[NSSortDescriptor sortDescriptorWithKey:@"length" ascending:YES]]];
+
+ const CGPoint midpoint1 = [midpoints[[sides[0][@"index"] intValue]] CGPointValue];
+ const CGPoint midpoint2 = [midpoints[[sides[1][@"index"] intValue]] CGPointValue];
+ const CGFloat dx = fabs(midpoint2.x - midpoint1.x);
+
+ CGFloat m, c;
+ BOOL isVertical;
+
+  std::vector<cv::Point2f> cvMidPoints = {cv::Point2f(midpoint1.x, midpoint1.y), cv::Point2f(midpoint2.x, midpoint2.y)};
+ cv::Vec4f line;
+
+ if (dx < verticalLineThreshold) {
+ for (auto &pt : cvMidPoints) std::swap(pt.x, pt.y);
+ cv::fitLine(cvMidPoints, line, cv::DIST_L2, 0, 0.01, 0.01);
+ m = line[1] / line[0];
+ c = line[3] - m * line[2];
+ isVertical = YES;
+ } else {
+ cv::fitLine(cvMidPoints, line, cv::DIST_L2, 0, 0.01, 0.01);
+ m = line[1] / line[0];
+ c = line[3] - m * line[2];
+ isVertical = NO;
+ }
+
+ return @{@"slope": @(m), @"intercept": @(c), @"isVertical": @(isVertical)};
+}
+
+/**
+ * This method assesses each box from a provided array, checks its center against the center of a "current box",
+ * and evaluates its alignment with a specified line equation. The function specifically searches for the box
+ * whose center is closest to the current box, that has not been ignored, and fits within a defined distance from the line.
+ *
+ * @param boxes An NSArray of NSDictionary objects where each dictionary represents a box with keys "bbox" and "angle".
+ * "bbox" is an NSArray of NSValue objects each encapsulating CGPoint that define the box vertices.
+ * "angle" is a NSNumber representing the box's rotation angle.
+ * @param ignoredIdxs An NSSet of NSNumber objects representing indices of boxes to ignore in the evaluation.
+ * @param currentBox An NSArray of NSValue objects encapsulating CGPoints representing the current box to compare against.
+ * @param isVertical A pointer to a BOOL indicating if the line to compare distance to is vertical.
+ * @param m The slope (gradient) of the line against which the box's alignment is checked.
+ * @param c The y-intercept of the line equation y = mx + c.
+ * @param centerThreshold A multiplier to determine the threshold for the distance between the box's center and the line.
+ *
+ * @return A NSDictionary containing:
+ * - "idx" : NSNumber indicating the index of the found box in the original NSArray.
+ * - "boxHeight" : NSNumber representing the shortest side length of the found box.
+ * Returns nil if no suitable box is found.
+ */
++ (NSDictionary *)findClosestBox:(NSArray *)boxes
+ ignoredIdxs:(NSSet *)ignoredIdxs
+ currentBox:(NSArray *)currentBox
+ isVertical:(BOOL)isVertical
+ m:(CGFloat)m
+ c:(CGFloat)c
+ centerThreshold:(CGFloat)centerThreshold
+{
+ CGFloat smallestDistance = CGFLOAT_MAX;
+ NSInteger idx = -1;
+ CGFloat boxHeight = 0;
+ const CGPoint centerOfCurrentBox = [self centerOfBox:currentBox];
+
+ for (NSUInteger i = 0; i < boxes.count; i++) {
+ if ([ignoredIdxs containsObject:@(i)]) {
+ continue;
+ }
+ NSArray *bbox = boxes[i][@"bbox"];
+ const CGPoint centerOfProcessedBox = [self centerOfBox:bbox];
+ const CGFloat distanceBetweenCenters = [self distanceFromPoint:centerOfCurrentBox toPoint:centerOfProcessedBox];
+
+ if (distanceBetweenCenters >= smallestDistance) {
+ continue;
+ }
+
+ boxHeight = [self minSideLength:bbox];
+
+ const CGFloat lineDistance = (isVertical ?
+ fabs(centerOfProcessedBox.x - (m * centerOfProcessedBox.y + c)) :
+ fabs(centerOfProcessedBox.y - (m * centerOfProcessedBox.x + c)));
+
+ if (lineDistance < boxHeight * centerThreshold) {
+ idx = i;
+ smallestDistance = distanceBetweenCenters;
+ }
+ }
+
+ return idx != -1 ? @{@"idx": @(idx), @"boxHeight": @(boxHeight)} : nil;
+}
+
+/**
+ * This method processes an array of text box dictionaries, each containing details about individual text boxes,
+ * and attempts to group and merge these boxes based on specified criteria including proximity, alignment,
+ * and size thresholds. It prioritizes merging of boxes that are aligned closely in angle, are near each other,
+ * and whose sizes are compatible based on the given thresholds.
+ *
+ * @param boxes An array of NSDictionary objects where each dictionary represents a text box. Each dictionary must have
+ * at least a "bbox" key with an NSArray of NSValue wrapping CGPoints defining the box vertices,
+ * and an "angle" key indicating the orientation of the box.
+ * @param centerThreshold A CGFloat representing the threshold for considering the distance between center and fitted line.
+ * @param distanceThreshold A CGFloat that defines the maximum allowed distance between boxes for them to be considered for merging.
+ * @param heightThreshold A CGFloat representing the maximum allowed difference in height between boxes for merging.
+ * @param minSideThreshold An int that defines the minimum dimension threshold to filter out small boxes after grouping.
+ * @param maxSideThreshold An int that specifies the maximum dimension threshold for filtering boxes post-grouping.
+ * @param maxWidth An int that represents the maximum width allowable for a merged box.
+ *
+ * @return An NSArray of NSDictionary objects representing the merged boxes. Each dictionary contains:
+ * - "bbox": An NSArray of NSValue each containing a CGPoint that defines the vertices of the merged box.
+ * - "angle": NSNumber representing the computed orientation of the merged box.
+ *
+ * Processing Steps:
+ * 1. Sort initial boxes based on their maximum side length.
+ * 2. Sequentially merge boxes considering alignment, proximity, and size compatibility.
+ * 3. Post-processing to remove any boxes that are too small or exceed max side criteria.
+ * 4. Sort the final array of boxes by their vertical positions.
+ */
++ (NSArray *)groupTextBoxes:(NSMutableArray *)boxes
+ centerThreshold:(CGFloat)centerThreshold
+ distanceThreshold:(CGFloat)distanceThreshold
+ heightThreshold:(CGFloat)heightThreshold
+ minSideThreshold:(int)minSideThreshold
+ maxSideThreshold:(int)maxSideThreshold
+ maxWidth:(int)maxWidth
+{
+ // Sort boxes based on their maximum side length
+ boxes = [boxes sortedArrayUsingComparator:^NSComparisonResult(NSDictionary *obj1, NSDictionary *obj2) {
+ const CGFloat maxLen1 = [self maxSideLength:obj1[@"bbox"]];
+ const CGFloat maxLen2 = [self maxSideLength:obj2[@"bbox"]];
+ return (maxLen1 < maxLen2) ? NSOrderedDescending : (maxLen1 > maxLen2) ? NSOrderedAscending : NSOrderedSame;
+ }].mutableCopy;
+
+ NSMutableArray *mergedArray = [NSMutableArray array];
+ CGFloat lineAngle;
+ while (boxes.count > 0) {
+ NSMutableDictionary *currentBox = [boxes[0] mutableCopy];
+ CGFloat normalizedAngle = [self normalizeAngle:[currentBox[@"angle"] floatValue]];
+ [boxes removeObjectAtIndex:0];
+ NSMutableArray *ignoredIdxs = [NSMutableArray array];
+
+ while (YES) {
+      // Find all aligned boxes and merge them until maxWidth is reached or no more boxes can be merged
+ NSDictionary *fittedLine = [self fitLineToShortestSides:currentBox[@"bbox"]];
+ const CGFloat slope = [fittedLine[@"slope"] floatValue];
+ const CGFloat intercept = [fittedLine[@"intercept"] floatValue];
+ const BOOL isVertical = [fittedLine[@"isVertical"] boolValue];
+
+ lineAngle = atan(slope) * 180 / M_PI;
+ if (isVertical){
+ lineAngle = -90;
+ }
+
+ NSDictionary *closestBoxInfo = [self findClosestBox:boxes ignoredIdxs:[NSSet setWithArray:ignoredIdxs] currentBox:currentBox[@"bbox"] isVertical:isVertical m:slope c:intercept centerThreshold:centerThreshold];
+ if (closestBoxInfo == nil) break;
+
+ NSInteger candidateIdx = [closestBoxInfo[@"idx"] integerValue];
+ NSMutableDictionary *candidateBox = [boxes[candidateIdx] mutableCopy];
+ const CGFloat candidateHeight = [closestBoxInfo[@"boxHeight"] floatValue];
+
+ if (([candidateBox[@"angle"] isEqual: @90] && !isVertical) || ([candidateBox[@"angle"] isEqual: @0] && isVertical)) {
+ candidateBox[@"bbox"] = [self rotateBox:candidateBox[@"bbox"] withAngle:normalizedAngle];
+ }
+
+ const CGFloat minDistance = [self calculateMinimalDistanceBetweenBox:candidateBox[@"bbox"] andBox:currentBox[@"bbox"]];
+ const CGFloat mergedHeight = [self minSideLength:currentBox[@"bbox"]];
+ if (minDistance < distanceThreshold * candidateHeight && fabs(mergedHeight - candidateHeight) < candidateHeight * heightThreshold) {
+ currentBox[@"bbox"] = [self mergeRotatedBoxes:currentBox[@"bbox"] withBox:candidateBox[@"bbox"]];
+ [boxes removeObjectAtIndex:candidateIdx];
+ [ignoredIdxs removeAllObjects];
+ if ([self maxSideLength:currentBox[@"bbox"]] > maxWidth){
+ break;
+ }
+ } else {
+ [ignoredIdxs addObject:@(candidateIdx)];
+ }
+ }
+
+ [mergedArray addObject:@{@"bbox" : currentBox[@"bbox"], @"angle" : @(lineAngle)}];
+ }
+
+  // Remove small boxes and sort by vertical position
+ mergedArray = [self removeSmallBoxesFromArray:mergedArray usingMinSideThreshold:minSideThreshold maxSideThreshold:maxSideThreshold];
+
+ NSArray *sortedBoxes = [mergedArray sortedArrayUsingComparator:^NSComparisonResult(NSDictionary *obj1, NSDictionary *obj2) {
+ NSArray *coords1 = obj1[@"bbox"];
+ NSArray *coords2 = obj2[@"bbox"];
+ const CGFloat minY1 = [self minimumYFromBox:coords1];
+ const CGFloat minY2 = [self minimumYFromBox:coords2];
+ return (minY1 < minY2) ? NSOrderedAscending : (minY1 > minY2) ? NSOrderedDescending : NSOrderedSame;
+ }];
+
+ return sortedBoxes;
+}
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/OCRUtils.h b/ios/RnExecutorch/models/ocr/utils/OCRUtils.h
new file mode 100644
index 00000000..0304ad37
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/OCRUtils.h
@@ -0,0 +1,7 @@
+#import "opencv2/opencv.hpp"
+
+@interface OCRUtils : NSObject
+
++ (cv::Mat)resizeWithPadding:(cv::Mat)img desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight;
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/OCRUtils.mm b/ios/RnExecutorch/models/ocr/utils/OCRUtils.mm
new file mode 100644
index 00000000..3bec6244
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/OCRUtils.mm
@@ -0,0 +1,49 @@
+#import "OCRUtils.h"
+
+@implementation OCRUtils
+
++ (cv::Mat)resizeWithPadding:(cv::Mat)img desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight {
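+  // Resize while preserving the aspect ratio, then pad to the requested size. The border colour
+  // is the mean of four small corner patches so the padding blends with the image background.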
+ const int height = img.rows;
+ const int width = img.cols;
+ const float heightRatio = (float)desiredHeight / height;
+ const float widthRatio = (float)desiredWidth / width;
+ const float resizeRatio = MIN(heightRatio, widthRatio);
+
+ const int newWidth = width * resizeRatio;
+ const int newHeight = height * resizeRatio;
+
+ cv::Mat resizedImg;
+ cv::resize(img, resizedImg, cv::Size(newWidth, newHeight), 0, 0, cv::INTER_AREA);
+
+ const int cornerPatchSize = MAX(1, MIN(height, width) / 30);
+ std::vector corners = {
+ img(cv::Rect(0, 0, cornerPatchSize, cornerPatchSize)),
+ img(cv::Rect(width - cornerPatchSize, 0, cornerPatchSize, cornerPatchSize)),
+ img(cv::Rect(0, height - cornerPatchSize, cornerPatchSize, cornerPatchSize)),
+ img(cv::Rect(width - cornerPatchSize, height - cornerPatchSize, cornerPatchSize, cornerPatchSize))
+ };
+
+ cv::Scalar backgroundScalar = cv::mean(corners[0]);
+ for (int i = 1; i < corners.size(); i++) {
+ backgroundScalar += cv::mean(corners[i]);
+ }
+ backgroundScalar /= (double)corners.size();
+
+ backgroundScalar[0] = cvFloor(backgroundScalar[0]);
+ backgroundScalar[1] = cvFloor(backgroundScalar[1]);
+ backgroundScalar[2] = cvFloor(backgroundScalar[2]);
+
+ const int deltaW = desiredWidth - newWidth;
+ const int deltaH = desiredHeight - newHeight;
+ const int top = deltaH / 2;
+ const int bottom = deltaH - top;
+ const int left = deltaW / 2;
+ const int right = deltaW - left;
+
+ cv::Mat centeredImg;
+ cv::copyMakeBorder(resizedImg, centeredImg, top, bottom, left, right, cv::BORDER_CONSTANT, backgroundScalar);
+
+ return centeredImg;
+}
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.h b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.h
new file mode 100644
index 00000000..337cdc9f
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.h
@@ -0,0 +1,17 @@
+#import "opencv2/opencv.hpp"
+
+@interface RecognizerUtils : NSObject
+
++ (CGFloat)calculateRatio:(int)width height:(int)height;
++ (cv::Mat)computeRatioAndResize:(cv::Mat)img width:(int)width height:(int)height modelHeight:(int)modelHeight;
++ (cv::Mat)normalizeForRecognizer:(cv::Mat)image adjustContrast:(double)adjustContrast;
++ (cv::Mat)adjustContrastGrey:(cv::Mat)img target:(double)target;
++ (cv::Mat)divideMatrix:(cv::Mat)matrix byVector:(NSArray *)vector;
++ (cv::Mat)softmax:(cv::Mat)inputs;
++ (NSDictionary *)calculateResizeRatioAndPaddings:(int)width height:(int)height desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight;
++ (cv::Mat)getCroppedImage:(NSDictionary *)box image:(cv::Mat)image modelHeight:(int)modelHeight;
++ (NSMutableArray *)sumProbabilityRows:(cv::Mat)probabilities modelOutputHeight:(int)modelOutputHeight;
++ (NSArray *)findMaxValuesAndIndices:(cv::Mat)probabilities;
++ (double)computeConfidenceScore:(NSArray *)valuesArray indicesArray:(NSArray *)indicesArray;
+
+@end
diff --git a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
new file mode 100644
index 00000000..74048e20
--- /dev/null
+++ b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
@@ -0,0 +1,202 @@
+#import "OCRUtils.h"
+#import "RecognizerUtils.h"
+
+@implementation RecognizerUtils
+
++ (CGFloat)calculateRatio:(int)width height:(int)height {
+ CGFloat ratio = (CGFloat)width / (CGFloat)height;
+ if (ratio < 1.0) {
+ ratio = 1.0 / ratio;
+ }
+ return ratio;
+}
+
++ (cv::Mat)computeRatioAndResize:(cv::Mat)img width:(int)width height:(int)height modelHeight:(int)modelHeight {
+ CGFloat ratio = (CGFloat)width / (CGFloat)height;
+ if (ratio < 1.0) {
+ ratio = [self calculateRatio:width height:height];
+ cv::resize(img, img, cv::Size(modelHeight, (int)(modelHeight * ratio)), 0, 0, cv::INTER_LANCZOS4);
+ } else {
+ cv::resize(img, img, cv::Size((int)(modelHeight * ratio), modelHeight), 0, 0, cv::INTER_LANCZOS4);
+ }
+ return img;
+}
+
++ (cv::Mat)adjustContrastGrey:(cv::Mat)img target:(double)target {
+ double contrast = 0.0;
+ int high = 0;
+ int low = 255;
+
+ for (int i = 0; i < img.rows; ++i) {
+ for (int j = 0; j < img.cols; ++j) {
+      uchar pixel = img.at<uchar>(i, j);
+ high = MAX(high, pixel);
+ low = MIN(low, pixel);
+ }
+ }
+ contrast = (high - low) / 255.0;
+
+ if (contrast < target) {
+ const double ratio = 200.0 / MAX(10, high - low);
+ img.convertTo(img, CV_32F);
+ img = ((img - low + 25) * ratio);
+
+ cv::threshold(img, img, 255, 255, cv::THRESH_TRUNC);
+ cv::threshold(img, img, 0, 0, cv::THRESH_TOZERO);
+
+ img.convertTo(img, CV_8U);
+ }
+
+ return img;
+}
+
++ (cv::Mat)normalizeForRecognizer:(cv::Mat)image adjustContrast:(double)adjustContrast {
+ if (adjustContrast > 0) {
+ image = [self adjustContrastGrey:image target:adjustContrast];
+ }
+
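+  // Pick the smallest recognizer input width (128/256/512) that fits the crop; these widths
+  // match the small/medium/large recognizer models declared in RecognitionHandler.h.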
+ int desiredWidth = 128;
+ if (image.cols >= 512) {
+ desiredWidth = 512;
+ } else if (image.cols >= 256) {
+ desiredWidth = 256;
+ }
+
+ image = [OCRUtils resizeWithPadding:image desiredWidth:desiredWidth desiredHeight:64];
+
+ image.convertTo(image, CV_32F, 1.0 / 255.0);
+ image = (image - 0.5) * 2.0;
+
+ return image;
+}
+
++ (cv::Mat)divideMatrix:(cv::Mat)matrix byVector:(NSArray *)vector {
+ cv::Mat result = matrix.clone();
+
+ for (int i = 0; i < matrix.rows; i++) {
+ const float divisor = [vector[i] floatValue];
+ for (int j = 0; j < matrix.cols; j++) {
+ result.at<float>(i, j) /= divisor;
+ }
+ }
+
+ return result;
+}
+
++ (cv::Mat)softmax:(cv::Mat)inputs {
+ cv::Mat maxVal;
+ cv::reduce(inputs, maxVal, 1, cv::REDUCE_MAX, CV_32F);
+ cv::Mat expInputs;
+ cv::exp(inputs - cv::repeat(maxVal, 1, inputs.cols), expInputs);
+ cv::Mat sumExp;
+ cv::reduce(expInputs, sumExp, 1, cv::REDUCE_SUM, CV_32F);
+ cv::Mat softmaxOutput = expInputs / cv::repeat(sumExp, 1, inputs.cols);
+ return softmaxOutput;
+}
+
++ (NSDictionary *)calculateResizeRatioAndPaddings:(int)width height:(int)height desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight {
+ const float newRatioH = (float)desiredHeight / height;
+ const float newRatioW = (float)desiredWidth / width;
+ float resizeRatio = MIN(newRatioH, newRatioW);
+ const int newWidth = width * resizeRatio;
+ const int newHeight = height * resizeRatio;
+ const int deltaW = desiredWidth - newWidth;
+ const int deltaH = desiredHeight - newHeight;
+ const int top = deltaH / 2;
+ const int left = deltaW / 2;
+ const float heightRatio = (float)height / desiredHeight;
+ const float widthRatio = (float)width / desiredWidth;
+
+ resizeRatio = MAX(heightRatio, widthRatio);
+
+ return @{
+ @"resizeRatio": @(resizeRatio),
+ @"top": @(top),
+ @"left": @(left),
+ };
+}
+
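+// Rotates the whole image by the detected box angle, maps the box corners through
+// the same rotation, crops their axis-aligned bounding rectangle (clipped to the
+// image), and resizes the crop to the recognizer's model height.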
++ (cv::Mat)getCroppedImage:(NSDictionary *)box image:(cv::Mat)image modelHeight:(int)modelHeight {
+ NSArray *coords = box[@"bbox"];
+ const CGFloat angle = [box[@"angle"] floatValue];
+
+ std::vector<cv::Point2f> points;
+ for (NSValue *value in coords) {
+ const CGPoint point = [value CGPointValue];
+ points.emplace_back(static_cast<float>(point.x), static_cast<float>(point.y));
+ }
+
+ cv::RotatedRect rotatedRect = cv::minAreaRect(points);
+
+ cv::Point2f imageCenter = cv::Point2f(image.cols / 2.0, image.rows / 2.0);
+ cv::Mat rotationMatrix = cv::getRotationMatrix2D(imageCenter, angle, 1.0);
+ cv::Mat rotatedImage;
+ cv::warpAffine(image, rotatedImage, rotationMatrix, image.size(), cv::INTER_LINEAR);
+ cv::Point2f rectPoints[4];
+ rotatedRect.points(rectPoints);
+ std::vector<cv::Point2f> transformedPoints(4);
+ // rectMat wraps the rectPoints buffer (4 two-channel entries), so the in-place
+ // cv::transform below rotates the corner points themselves.
+ cv::Mat rectMat(4, 1, CV_32FC2, rectPoints);
+ cv::transform(rectMat, rectMat, rotationMatrix);
+
+ for (int i = 0; i < 4; ++i) {
+ transformedPoints[i] = rectPoints[i];
+ }
+
+ cv::Rect boundingBox = cv::boundingRect(transformedPoints);
+ boundingBox &= cv::Rect(0, 0, rotatedImage.cols, rotatedImage.rows);
+ if (boundingBox.width == 0 || boundingBox.height == 0) {
+ // Degenerate box after clipping to the image bounds - return an empty Mat.
+ return cv::Mat();
+ }
+
+ cv::Mat croppedImage = rotatedImage(boundingBox);
+
+ croppedImage = [self computeRatioAndResize:croppedImage width:boundingBox.width height:boundingBox.height modelHeight:modelHeight];
+
+ return croppedImage;
+}
+
++ (NSMutableArray *)sumProbabilityRows:(cv::Mat)probabilities modelOutputHeight:(int)modelOutputHeight {
+ NSMutableArray *predsNorm = [NSMutableArray arrayWithCapacity:probabilities.rows];
+ for (int i = 0; i < probabilities.rows; i++) {
+ float sum = 0.0;
+ for (int j = 0; j < modelOutputHeight; j++) {
+ sum += probabilities.at<float>(i, j);
+ }
+ [predsNorm addObject:@(sum)];
+ }
+ return predsNorm;
+}
+
++ (NSArray *)findMaxValuesAndIndices:(cv::Mat)probabilities {
+ NSMutableArray *valuesArray = [NSMutableArray array];
+ NSMutableArray *indicesArray = [NSMutableArray array];
+ for (int i = 0; i < probabilities.rows; i++) {
+ double maxVal = 0;
+ cv::Point maxLoc;
+ cv::minMaxLoc(probabilities.row(i), NULL, &maxVal, NULL, &maxLoc);
+ [valuesArray addObject:@(maxVal)];
+ [indicesArray addObject:@(maxLoc.x)];
+ }
+ return @[valuesArray, indicesArray];
+}
+
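+// Aggregates confidence over non-blank predictions (index 0 is treated as the
+// blank symbol): score = (product of max probabilities) ^ (2 / sqrt(n)).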
++ (double)computeConfidenceScore:(NSArray *)valuesArray indicesArray:(NSArray *)indicesArray {
+ NSMutableArray *predsMaxProb = [NSMutableArray array];
+ for (NSUInteger index = 0; index < indicesArray.count; index++) {
+ NSNumber *indicator = indicesArray[index];
+ if ([indicator intValue] != 0) {
+ [predsMaxProb addObject:valuesArray[index]];
+ }
+ }
+ if (predsMaxProb.count == 0) {
+ [predsMaxProb addObject:@(0)];
+ }
+ double product = 1.0;
+ for (NSNumber *prob in predsMaxProb) {
+ product *= [prob doubleValue];
+ }
+ return pow(product, 2.0 / sqrt(predsMaxProb.count));
+}
+
+@end
diff --git a/ios/RnExecutorch/utils/Fetcher.h b/ios/RnExecutorch/utils/Fetcher.h
index 9d75a574..02334bee 100644
--- a/ios/RnExecutorch/utils/Fetcher.h
+++ b/ios/RnExecutorch/utils/Fetcher.h
@@ -4,7 +4,8 @@
enum class ResourceType
{
MODEL,
- TOKENIZER
+ TOKENIZER,
+ TXT
};
inline constexpr unsigned int STATUS_OK = 200;
diff --git a/ios/RnExecutorch/utils/Fetcher.mm b/ios/RnExecutorch/utils/Fetcher.mm
index 27ae238e..86cc420a 100644
--- a/ios/RnExecutorch/utils/Fetcher.mm
+++ b/ios/RnExecutorch/utils/Fetcher.mm
@@ -46,6 +46,8 @@ + (BOOL) hasValidExtension:(NSString *)fileName resourceType:(ResourceType)resou
return [fileName hasSuffix:@".bin"];
case ResourceType::MODEL:
return [fileName hasSuffix:@".pte"];
+ case ResourceType::TXT:
+ return [fileName hasSuffix:@".txt"];
default:
return NO;
}
diff --git a/ios/RnExecutorch/utils/ImageProcessor.h b/ios/RnExecutorch/utils/ImageProcessor.h
index 4bb7034e..c65182d0 100644
--- a/ios/RnExecutorch/utils/ImageProcessor.h
+++ b/ios/RnExecutorch/utils/ImageProcessor.h
@@ -3,8 +3,13 @@
@interface ImageProcessor : NSObject
++ (NSArray *)matToNSArray:(const cv::Mat &)mat
+ mean:(cv::Scalar)mean
+ variance:(cv::Scalar)variance;
+ (NSArray *)matToNSArray:(const cv::Mat &)mat;
+ (cv::Mat)arrayToMat:(NSArray *)array width:(int)width height:(int)height;
++ (cv::Mat)arrayToMatGray:(NSArray *)array width:(int)width height:(int)height;
++ (NSArray *)matToNSArrayGray:(const cv::Mat &)mat;
+ (NSString *)saveToTempFile:(const cv::Mat &)image;
+ (cv::Mat)readImage:(NSString *)source;
diff --git a/ios/RnExecutorch/utils/ImageProcessor.mm b/ios/RnExecutorch/utils/ImageProcessor.mm
index feab17f6..a8617c26 100644
--- a/ios/RnExecutorch/utils/ImageProcessor.mm
+++ b/ios/RnExecutorch/utils/ImageProcessor.mm
@@ -4,6 +4,12 @@
@implementation ImageProcessor
+ (NSArray *)matToNSArray:(const cv::Mat &)mat {
+ return [ImageProcessor matToNSArray:mat mean:cv::Scalar(0.0, 0.0, 0.0) variance:cv::Scalar(1.0, 1.0, 1.0)];
+}
+
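+// Channel-wise normalization: value = (pixel - mean * 255) / (variance * 255),
+// with mean and variance expressed in the [0, 1] range.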
++ (NSArray *)matToNSArray:(const cv::Mat &)mat
+ mean:(cv::Scalar)mean
+ variance:(cv::Scalar)variance {
int pixelCount = mat.cols * mat.rows;
NSMutableArray *floatArray = [[NSMutableArray alloc] initWithCapacity:pixelCount * 3];
for (NSUInteger k = 0; k < pixelCount * 3; k++) {
@@ -14,14 +20,27 @@ + (NSArray *)matToNSArray:(const cv::Mat &)mat {
int row = i / mat.cols;
int col = i % mat.cols;
cv::Vec3b pixel = mat.at<cv::Vec3b>(row, col);
- floatArray[0 * pixelCount + i] = @(pixel[2] / 255.0f);
- floatArray[1 * pixelCount + i] = @(pixel[1] / 255.0f);
- floatArray[2 * pixelCount + i] = @(pixel[0] / 255.0f);
+ floatArray[0 * pixelCount + i] = @((pixel[0] - mean[0] * 255.0) / (variance[0] * 255.0));
+ floatArray[1 * pixelCount + i] = @((pixel[1] - mean[1] * 255.0) / (variance[1] * 255.0));
+ floatArray[2 * pixelCount + i] = @((pixel[2] - mean[2] * 255.0) / (variance[2] * 255.0));
}
return floatArray;
}
++ (NSArray *)matToNSArrayGray:(const cv::Mat &)mat {
+ NSMutableArray *pixelArray = [[NSMutableArray alloc] initWithCapacity:mat.cols * mat.rows];
+
+ for (int row = 0; row < mat.rows; row++) {
+ for (int col = 0; col < mat.cols; col++) {
+ float pixelValue = mat.at<float>(row, col);
+ [pixelArray addObject:@(pixelValue)];
+ }
+ }
+
+ return pixelArray;
+}
+
+ (cv::Mat)arrayToMat:(NSArray *)array width:(int)width height:(int)height {
cv::Mat mat(height, width, CV_8UC3);
@@ -42,6 +61,20 @@ + (NSArray *)matToNSArray:(const cv::Mat &)mat {
return mat;
}
++ (cv::Mat)arrayToMatGray:(NSArray *)array width:(int)width height:(int)height {
+ cv::Mat mat(height, width, CV_32F);
+
+ int pixelCount = width * height;
+ for (int i = 0; i < pixelCount; i++) {
+ int row = i / width;
+ int col = i % width;
+ float value = [array[i] floatValue];
+ mat.at<float>(row, col) = value;
+ }
+
+ return mat;
+}
+
+ (NSString *)saveToTempFile:(const cv::Mat&)image {
NSString *uniqueID = [[NSUUID UUID] UUIDString];
NSString *filename = [NSString stringWithFormat:@"rn_executorch_%@.png", uniqueID];
@@ -65,9 +98,9 @@ + (NSString *)saveToTempFile:(const cv::Mat&)image {
//base64
NSArray *parts = [source componentsSeparatedByString:@","];
if ([parts count] < 2) {
- @throw [NSException exceptionWithName:@"readImage_error"
- reason:[NSString stringWithFormat:@"%ld", (long)InvalidArgument]
- userInfo:nil];
+ @throw [NSException exceptionWithName:@"readImage_error"
+ reason:[NSString stringWithFormat:@"%ld", (long)InvalidArgument]
+ userInfo:nil];
}
NSString *encodedString = parts[1];
NSData *data = [[NSData alloc] initWithBase64EncodedString:encodedString options:NSDataBase64DecodingIgnoreUnknownCharacters];
diff --git a/src/Error.ts b/src/Error.ts
index 76785639..955b62a9 100644
--- a/src/Error.ts
+++ b/src/Error.ts
@@ -4,6 +4,7 @@ export enum ETError {
ModuleNotLoaded = 0x66,
FileWriteFailed = 0x67,
ModelGenerating = 0x68,
+ LanguageNotSupported = 0x69,
InvalidModelSource = 0xff,
// ExecuTorch mapped errors
diff --git a/src/OCR.ts b/src/OCR.ts
new file mode 100644
index 00000000..17c4aafc
--- /dev/null
+++ b/src/OCR.ts
@@ -0,0 +1,114 @@
+import { useEffect, useState } from 'react';
+import { ResourceSource } from './types/common';
+import { OCR } from './native/RnExecutorchModules';
+import { ETError, getError } from './Error';
+import { Image } from 'react-native';
+import { OCRDetection } from './types/ocr';
+import { symbols } from './constants/ocr/symbols';
+import { languageDicts } from './constants/ocr/languageDicts';
+
+interface OCRModule {
+ error: string | null;
+ isReady: boolean;
+ isGenerating: boolean;
+ forward: (input: string) => Promise<OCRDetection[]>;
+}
+
+const getResourcePath = (source: ResourceSource) => {
+ if (typeof source === 'number') {
+ return Image.resolveAssetSource(source).uri;
+ }
+ return source;
+};
+
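+/**
+ * Hook that loads the OCR detector/recognizer models and exposes a `forward`
+ * call returning OCRDetection[] for an image URI.
+ *
+ * Minimal usage sketch - the model URLs below are placeholders, not published
+ * defaults:
+ *
+ *   const { isReady, forward } = useOCR({
+ *     detectorSource: 'https://example.com/detector.pte',
+ *     recognizerSources: {
+ *       recognizerLarge: 'https://example.com/recognizer_512.pte',
+ *       recognizerMedium: 'https://example.com/recognizer_256.pte',
+ *       recognizerSmall: 'https://example.com/recognizer_128.pte',
+ *     },
+ *     language: 'en',
+ *   });
+ *   // later, once isReady: const detections = await forward(imageUri);
+ */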
+export const useOCR = ({
+ detectorSource,
+ recognizerSources,
+ language = 'en',
+}: {
+ detectorSource: ResourceSource;
+ recognizerSources: {
+ recognizerLarge: ResourceSource;
+ recognizerMedium: ResourceSource;
+ recognizerSmall: ResourceSource;
+ };
+ language?: string;
+}): OCRModule => {
+ const [error, setError] = useState<string | null>(null);
+ const [isReady, setIsReady] = useState(false);
+ const [isGenerating, setIsGenerating] = useState(false);
+
+ useEffect(() => {
+ const loadModel = async () => {
+ if (!detectorSource || Object.keys(recognizerSources).length === 0)
+ return;
+
+ const detectorPath = getResourcePath(detectorSource);
+ const recognizerPaths = {} as {
+ recognizerLarge: string;
+ recognizerMedium: string;
+ recognizerSmall: string;
+ };
+
+ if (!symbols[language] || !languageDicts[language]) {
+ setError(getError(ETError.LanguageNotSupported));
+ return;
+ }
+
+ for (const key in recognizerSources) {
+ if (recognizerSources.hasOwnProperty(key)) {
+ recognizerPaths[key as keyof typeof recognizerPaths] =
+ getResourcePath(
+ recognizerSources[key as keyof typeof recognizerSources]
+ );
+ }
+ }
+
+ const languageDictPath = getResourcePath(languageDicts[language]);
+
+ try {
+ setIsReady(false);
+ await OCR.loadModule(
+ detectorPath,
+ recognizerPaths.recognizerLarge,
+ recognizerPaths.recognizerMedium,
+ recognizerPaths.recognizerSmall,
+ symbols[language],
+ languageDictPath
+ );
+ setIsReady(true);
+ } catch (e) {
+ setError(getError(e));
+ }
+ };
+
+ loadModel();
+ // eslint-disable-next-line react-hooks/exhaustive-deps
+ }, [detectorSource, language, JSON.stringify(recognizerSources)]);
+
+ const forward = async (input: string) => {
+ if (!isReady) {
+ throw new Error(getError(ETError.ModuleNotLoaded));
+ }
+ if (isGenerating) {
+ throw new Error(getError(ETError.ModelGenerating));
+ }
+
+ try {
+ setIsGenerating(true);
+ const output = await OCR.forward(input);
+ return output;
+ } catch (e) {
+ throw new Error(getError(e));
+ } finally {
+ setIsGenerating(false);
+ }
+ };
+
+ return {
+ error,
+ isReady,
+ isGenerating,
+ forward,
+ };
+};
diff --git a/src/constants/ocr/languageDicts.ts b/src/constants/ocr/languageDicts.ts
new file mode 100644
index 00000000..fcd189b5
--- /dev/null
+++ b/src/constants/ocr/languageDicts.ts
@@ -0,0 +1,4 @@
+export const languageDicts: { [key: string]: string } = {
+ en: 'https://huggingface.co/nklockiewicz/ocr/resolve/main/en.txt',
+ pl: 'https://huggingface.co/nklockiewicz/ocr/resolve/main/pl.txt',
+};
diff --git a/src/constants/ocr/symbols.ts b/src/constants/ocr/symbols.ts
new file mode 100644
index 00000000..229c0613
--- /dev/null
+++ b/src/constants/ocr/symbols.ts
@@ -0,0 +1,4 @@
+export const symbols: { [key: string]: string } = {
+ en: '0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
+ pl: ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ªÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿĀāĂ㥹ĆćČčĎďĐđĒēĖėĘęĚěĞğĨĩĪīĮįİıĶķĹĺĻļĽľŁłŃńŅņŇňŒœŔŕŘřŚśŞşŠšŤťŨũŪūŮůŲųŸŹźŻżŽžƏƠơƯưȘșȚțə̇ḌḍḶḷṀṁṂṃṄṅṆṇṬṭẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ€',
+};
diff --git a/src/index.tsx b/src/index.tsx
index 74cfd13e..429b5061 100644
--- a/src/index.tsx
+++ b/src/index.tsx
@@ -1,7 +1,9 @@
export * from './ETModule';
export * from './LLM';
+export * from './OCR';
export * from './constants/modelUrls';
export * from './models/Classification';
export * from './models/ObjectDetection';
export * from './models/StyleTransfer';
export * from './types/object_detection';
+export * from './types/ocr';
diff --git a/src/native/NativeOCR.ts b/src/native/NativeOCR.ts
new file mode 100644
index 00000000..305bf012
--- /dev/null
+++ b/src/native/NativeOCR.ts
@@ -0,0 +1,17 @@
+import type { TurboModule } from 'react-native';
+import { TurboModuleRegistry } from 'react-native';
+import { OCRDetection } from '../types/ocr';
+
+export interface Spec extends TurboModule {
+ loadModule(
+ detectorSource: string,
+ recognizerSourceLarge: string,
+ recognizerSourceMedium: string,
+ recognizerSourceSmall: string,
+ symbols: string,
+ languageDictPath: string
+ ): Promise<void>;
+ forward(input: string): Promise<OCRDetection[]>;
+}
+
+export default TurboModuleRegistry.get<Spec>('OCR');
diff --git a/src/native/RnExecutorchModules.ts b/src/native/RnExecutorchModules.ts
index 8a80b595..925ec09b 100644
--- a/src/native/RnExecutorchModules.ts
+++ b/src/native/RnExecutorchModules.ts
@@ -71,6 +71,19 @@ const StyleTransfer = StyleTransferSpec
}
);
+const OCRSpec = require('./NativeOCR').default;
+
+const OCR = OCRSpec
+ ? OCRSpec
+ : new Proxy(
+ {},
+ {
+ get() {
+ throw new Error(LINKING_ERROR);
+ },
+ }
+ );
+
class _ObjectDetectionModule {
async forward(input: string) {
return await ObjectDetection.forward(input);
@@ -120,6 +133,7 @@ export {
Classification,
ObjectDetection,
StyleTransfer,
+ OCR,
_ETModule,
_ClassificationModule,
_StyleTransferModule,
diff --git a/src/types/ocr.ts b/src/types/ocr.ts
new file mode 100644
index 00000000..f5f2e6d3
--- /dev/null
+++ b/src/types/ocr.ts
@@ -0,0 +1,10 @@
+export interface OCRDetection {
+ bbox: OCRBbox[];
+ text: string;
+ score: number;
+}
+
+export interface OCRBbox {
+ x: number;
+ y: number;
+}
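+
+// `bbox` holds the corner points of the detected text region and `score` is the
+// recognizer confidence in the [0, 1] range.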
diff --git a/src/useModule.ts b/src/useModule.ts
index 66c2fd49..45e58c7b 100644
--- a/src/useModule.ts
+++ b/src/useModule.ts
@@ -1,7 +1,7 @@
import { useEffect, useState } from 'react';
import { Image } from 'react-native';
import { ETError, getError } from './Error';
-import { ETInput, module } from './types/common';
+import { ETInput, module, ResourceSource } from './types/common';
const getTypeIdentifier = (arr: ETInput): number => {
if (arr instanceof Int8Array) return 0;
@@ -14,7 +14,7 @@ const getTypeIdentifier = (arr: ETInput): number => {
};
interface Props {
- modelSource: string | number;
+ modelSource: ResourceSource;
module: module;
}