diff --git a/examples/computer-vision/App.tsx b/examples/computer-vision/App.tsx index 8d01269f..488c61cd 100644 --- a/examples/computer-vision/App.tsx +++ b/examples/computer-vision/App.tsx @@ -8,11 +8,13 @@ import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { View, StyleSheet } from 'react-native'; import { ClassificationScreen } from './screens/ClassificationScreen'; import { ObjectDetectionScreen } from './screens/ObjectDetectionScreen'; +import { OCRScreen } from './screens/OCRScreen'; enum ModelType { STYLE_TRANSFER, OBJECT_DETECTION, CLASSIFICATION, + OCR, } export default function App() { @@ -46,6 +48,8 @@ export default function App() { return ( ); + case ModelType.OCR: + return ; default: return ( @@ -64,6 +68,7 @@ export default function App() { 'Style Transfer', 'Object Detection', 'Classification', + 'OCR', ]} onValueChange={(_, selectedIndex) => { handleModeChange(selectedIndex); diff --git a/examples/computer-vision/components/ImageWithOCRBboxes.tsx b/examples/computer-vision/components/ImageWithOCRBboxes.tsx new file mode 100644 index 00000000..1c8fe616 --- /dev/null +++ b/examples/computer-vision/components/ImageWithOCRBboxes.tsx @@ -0,0 +1,103 @@ +// Import necessary components +import React from 'react'; +import { Image, StyleSheet, View } from 'react-native'; +import Svg, { Polygon } from 'react-native-svg'; +import { OCRDetection } from 'react-native-executorch'; + +interface Props { + imageUri: string; + detections: OCRDetection[]; + imageWidth: number; + imageHeight: number; +} + +export default function ImageWithOCRBboxes({ + imageUri, + detections, + imageWidth, + imageHeight, +}: Props) { + const [layout, setLayout] = React.useState({ width: 0, height: 0 }); + + const calculateAdjustedDimensions = () => { + const imageRatio = imageWidth / imageHeight; + const layoutRatio = layout.width / layout.height; + let sx, sy; + if (imageRatio > layoutRatio) { + sx = layout.width / imageWidth; + sy = layout.width / imageRatio / imageHeight; + } else { + sy = layout.height / imageHeight; + sx = (layout.height * imageRatio) / imageWidth; + } + return { + scaleX: sx, + scaleY: sy, + offsetX: (layout.width - imageWidth * sx) / 2, + offsetY: (layout.height - imageHeight * sy) / 2, + }; + }; + + return ( + { + const { width, height } = event.nativeEvent.layout; + setLayout({ width, height }); + }} + > + + + {detections.map((detection, index) => { + const { scaleX, scaleY, offsetX, offsetY } = + calculateAdjustedDimensions(); + const points = detection.bbox.map((point) => ({ + x: point.x * scaleX + offsetX, + y: point.y * scaleY + offsetY, + })); + + const pointsString = points + .map((point) => `${point.x},${point.y}`) + .join(' '); + + return ( + + ); + })} + + + ); +} + +const styles = StyleSheet.create({ + container: { + flex: 1, + position: 'relative', + }, + image: { + flex: 1, + width: '100%', + height: '100%', + }, + svgContainer: { + position: 'absolute', + top: 0, + left: 0, + right: 0, + bottom: 0, + }, +}); diff --git a/examples/computer-vision/screens/OCRScreen.tsx b/examples/computer-vision/screens/OCRScreen.tsx new file mode 100644 index 00000000..9d17118a --- /dev/null +++ b/examples/computer-vision/screens/OCRScreen.tsx @@ -0,0 +1,112 @@ +import Spinner from 'react-native-loading-spinner-overlay'; +import { BottomBar } from '../components/BottomBar'; +import { getImage } from '../utils'; +import { useOCR } from 'react-native-executorch'; +import { View, StyleSheet, Image, Text } from 'react-native'; +import { useState } from 
'react'; +import ImageWithBboxes2 from '../components/ImageWithOCRBboxes'; + +export const OCRScreen = ({ + imageUri, + setImageUri, +}: { + imageUri: string; + setImageUri: (imageUri: string) => void; +}) => { + const [results, setResults] = useState([]); + const [imageDimensions, setImageDimensions] = useState<{ + width: number; + height: number; + }>(); + const [detectedText, setDetectedText] = useState(''); + const model = useOCR({ + detectorSource: + 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_craft_800.pte', + recognizerSources: { + recognizerLarge: + 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_crnn_512.pte', + recognizerMedium: + 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_crnn_256.pte', + recognizerSmall: + 'https://huggingface.co/nklockiewicz/ocr/resolve/main/xnnpack_crnn_128.pte', + }, + language: 'en', + }); + + const handleCameraPress = async (isCamera: boolean) => { + const image = await getImage(isCamera); + const width = image?.width; + const height = image?.height; + setImageDimensions({ width: width as number, height: height as number }); + const uri = image?.uri; + if (typeof uri === 'string') { + setImageUri(uri as string); + setResults([]); + setDetectedText(''); + } + }; + + const runForward = async () => { + try { + const output = await model.forward(imageUri); + setResults(output); + console.log(output); + let txt = ''; + output.forEach((detection: any) => { + txt += detection.text + ' '; + }); + setDetectedText(txt); + } catch (e) { + console.error(e); + } + }; + + if (!model.isReady) { + return ( + + ); + } + + return ( + <> + + + {imageUri && imageDimensions?.width && imageDimensions?.height ? ( + + ) : ( + + )} + + {detectedText} + + + + ); +}; + +const styles = StyleSheet.create({ + image: { + flex: 2, + borderRadius: 8, + width: '100%', + }, + imageContainer: { + flex: 6, + width: '100%', + padding: 16, + }, +}); diff --git a/ios/RnExecutorch.xcodeproj/project.pbxproj b/ios/RnExecutorch.xcodeproj/project.pbxproj index af71112a..d7710953 100644 --- a/ios/RnExecutorch.xcodeproj/project.pbxproj +++ b/ios/RnExecutorch.xcodeproj/project.pbxproj @@ -37,12 +37,20 @@ LLM.h, ); }; + 552754CC2D394AC9006B38A2 /* Exceptions for "RnExecutorch" folder in "Compile Sources" phase from "RnExecutorch" target */ = { + isa = PBXFileSystemSynchronizedGroupBuildPhaseMembershipExceptionSet; + buildPhase = 550986852CEF541900FECBB8 /* Sources */; + membershipExceptions = ( + models/ocr/utils/DetectorUtils.h, + ); + }; /* End PBXFileSystemSynchronizedGroupBuildPhaseMembershipExceptionSet section */ /* Begin PBXFileSystemSynchronizedRootGroup section */ 5509868B2CEF541900FECBB8 /* RnExecutorch */ = { isa = PBXFileSystemSynchronizedRootGroup; exceptions = ( + 552754CC2D394AC9006B38A2 /* Exceptions for "RnExecutorch" folder in "Compile Sources" phase from "RnExecutorch" target */, 550986902CEF541900FECBB8 /* Exceptions for "RnExecutorch" folder in "Copy Files" phase from "RnExecutorch" target */, ); path = RnExecutorch; @@ -123,6 +131,7 @@ TargetAttributes = { 550986882CEF541900FECBB8 = { CreatedOnToolsVersion = 16.1; + LastSwiftMigration = 1610; }; }; }; @@ -275,6 +284,7 @@ 550986942CEF541900FECBB8 /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { + CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; OTHER_LDFLAGS = "-ObjC"; PRODUCT_NAME = "$(TARGET_NAME)"; @@ -283,6 +293,8 @@ SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; + SWIFT_OPTIMIZATION_LEVEL = 
"-Onone"; + SWIFT_VERSION = 6.0; TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; @@ -290,6 +302,7 @@ 550986952CEF541900FECBB8 /* Release */ = { isa = XCBuildConfiguration; buildSettings = { + CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; OTHER_LDFLAGS = "-ObjC"; PRODUCT_NAME = "$(TARGET_NAME)"; @@ -298,6 +311,7 @@ SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; + SWIFT_VERSION = 6.0; TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; diff --git a/ios/RnExecutorch/OCR.h b/ios/RnExecutorch/OCR.h new file mode 100644 index 00000000..68c08785 --- /dev/null +++ b/ios/RnExecutorch/OCR.h @@ -0,0 +1,7 @@ +#import + +constexpr CGFloat recognizerRatio = 1.6; + +@interface OCR : NSObject + +@end diff --git a/ios/RnExecutorch/OCR.mm b/ios/RnExecutorch/OCR.mm new file mode 100644 index 00000000..cd58d2c4 --- /dev/null +++ b/ios/RnExecutorch/OCR.mm @@ -0,0 +1,83 @@ +#import +#import +#import "OCR.h" +#import "utils/Fetcher.h" +#import "utils/ImageProcessor.h" +#import "models/ocr/Detector.h" +#import "models/ocr/RecognitionHandler.h" + +@implementation OCR { + Detector *detector; + RecognitionHandler *recognitionHandler; +} + +RCT_EXPORT_MODULE() + +- (void)loadModule:(NSString *)detectorSource +recognizerSourceLarge:(NSString *)recognizerSourceLarge +recognizerSourceMedium:(NSString *)recognizerSourceMedium +recognizerSourceSmall:(NSString *)recognizerSourceSmall + symbols:(NSString *)symbols + languageDictPath:(NSString *)languageDictPath + resolve:(RCTPromiseResolveBlock)resolve + reject:(RCTPromiseRejectBlock)reject { + detector = [[Detector alloc] init]; + [detector loadModel:[NSURL URLWithString:detectorSource] completion:^(BOOL success, NSNumber *errorCode) { + if (!success) { + NSError *error = [NSError errorWithDomain:@"OCRErrorDomain" + code:[errorCode intValue] + userInfo:@{NSLocalizedDescriptionKey: [NSString stringWithFormat:@"%ld", (long)[errorCode longValue]]}]; + reject(@"init_module_error", @"Failed to initialize detector module", error); + return; + } + [Fetcher fetchResource:[NSURL URLWithString:languageDictPath] resourceType:ResourceType::TXT completionHandler:^(NSString *filePath, NSError *error) { + if (error) { + reject(@"init_module_error", @"Failed to initialize converter module", error); + return; + } + + self->recognitionHandler = [[RecognitionHandler alloc] initWithSymbols:symbols languageDictPath:filePath]; + [self->recognitionHandler loadRecognizers:recognizerSourceLarge mediumRecognizerPath:recognizerSourceMedium smallRecognizerPath:recognizerSourceSmall completion:^(BOOL allModelsLoaded, NSNumber *errorCode) { + if (allModelsLoaded) { + resolve(@(YES)); + } else { + NSError *error = [NSError errorWithDomain:@"OCRErrorDomain" + code:[errorCode intValue] + userInfo:@{NSLocalizedDescriptionKey: [NSString stringWithFormat:@"%ld", (long)[errorCode longValue]]}]; + reject(@"init_recognizer_error", @"Failed to initialize one or more recognizer models", error); + } + }]; + }]; + }]; +} + +- (void)forward:(NSString *)input + resolve:(RCTPromiseResolveBlock)resolve + reject:(RCTPromiseRejectBlock)reject { + /* + The OCR consists of two phases: + 1. Detection - detecting text regions in the image, the result of this phase is a list of bounding boxes. + 2. Recognition - recognizing the text in the bounding boxes, the result is a list of strings and corresponding confidence scores. + + Recognition uses three models, each model is resposible for recognizing text of different sizes (e.g. 
large - 512x64, medium - 256x64, small - 128x64). + */ + @try { + cv::Mat image = [ImageProcessor readImage:input]; + NSArray* result = [detector runModel:image]; + cv::Size detectorSize = [detector getModelImageSize]; + cv::cvtColor(image, image, cv::COLOR_BGR2GRAY); + result = [self->recognitionHandler recognize:result imgGray:image desiredWidth:detectorSize.width * recognizerRatio desiredHeight:detectorSize.height * recognizerRatio]; + resolve(result); + } @catch (NSException *exception) { + reject(@"forward_error", [NSString stringWithFormat:@"%@", exception.reason], + nil); + } +} + +- (std::shared_ptr)getTurboModule: +(const facebook::react::ObjCTurboModule::InitParams &)params { + return std::make_shared( + params); +} + +@end diff --git a/ios/RnExecutorch/models/ocr/Detector.h b/ios/RnExecutorch/models/ocr/Detector.h new file mode 100644 index 00000000..34606972 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/Detector.h @@ -0,0 +1,25 @@ +#import "opencv2/opencv.hpp" +#import "BaseModel.h" +#import "RecognitionHandler.h" + +constexpr CGFloat textThreshold = 0.4; +constexpr CGFloat linkThreshold = 0.4; +constexpr CGFloat lowTextThreshold = 0.7; +constexpr CGFloat centerThreshold = 0.5; +constexpr CGFloat distanceThreshold = 2.0; +constexpr CGFloat heightThreshold = 2.0; +constexpr CGFloat restoreRatio = 3.2; +constexpr int minSideThreshold = 15; +constexpr int maxSideThreshold = 30; +constexpr int maxWidth = largeModelWidth + (largeModelWidth * 0.15); +constexpr int minSize = 20; + +const cv::Scalar mean(0.485, 0.456, 0.406); +const cv::Scalar variance(0.229, 0.224, 0.225); + +@interface Detector : BaseModel + +- (cv::Size)getModelImageSize; +- (NSArray *)runModel:(cv::Mat &)input; + +@end diff --git a/ios/RnExecutorch/models/ocr/Detector.mm b/ios/RnExecutorch/models/ocr/Detector.mm new file mode 100644 index 00000000..411c178d --- /dev/null +++ b/ios/RnExecutorch/models/ocr/Detector.mm @@ -0,0 +1,83 @@ +#import "Detector.h" +#import "../../utils/ImageProcessor.h" +#import "utils/DetectorUtils.h" +#import "utils/OCRUtils.h" + +/* + The model used as the detector is based on the CRAFT (Character Region Awareness for Text Detection) paper. + https://arxiv.org/pdf/1904.01941 + */ + +@implementation Detector { + cv::Size originalSize; + cv::Size modelSize; } + +- (cv::Size)getModelImageSize{ + if(!modelSize.empty()) { + return modelSize; + } + + NSArray *inputShape = [module getInputShape: @0]; + NSNumber *widthNumber = inputShape.lastObject; + NSNumber *heightNumber = inputShape[inputShape.count - 2]; + + const int height = [heightNumber intValue]; + const int width = [widthNumber intValue]; + modelSize = cv::Size(height, width); + + return cv::Size(height, width); +} + +- (NSArray *)preprocess:(cv::Mat &)input { + /* + The detector accepts an input tensor with a shape of [1, 3, 800, 800]. + Since resizing has a large impact on recognition quality, the image is resized while preserving its original + aspect ratio and the missing parts are filled with padding. + */ + self->originalSize = cv::Size(input.cols, input.rows); + + cv::Size modelImageSize = [self getModelImageSize]; + cv::Mat resizedImage; + resizedImage = [OCRUtils resizeWithPadding:input desiredWidth:modelImageSize.width desiredHeight:modelImageSize.height]; + NSArray *modelInput = [ImageProcessor matToNSArray: resizedImage mean:mean variance:variance]; + return modelInput; +} + +- (NSArray *)postprocess:(NSArray *)output { + /* + The output of the model consists of two matrices (heat maps): + 1.
ScoreText(Score map) - The probability of a region containing a character + 2. ScoreAffinity(Affinity map) - affinity between characters, used to group characters into a single instance (sequence) + Both matrices are 400x400 + + The result of this step is a list of bounding boxes that contain text. + */ + NSArray *predictions = [output objectAtIndex:0]; + + cv::Size modelImageSize = [self getModelImageSize]; + cv::Mat scoreTextCV, scoreAffinityCV; + /* + The model output interleaves the two heat maps in a single array. + Each map is half the size of the input image, which is why the width and height are divided by 2. + */ + [DetectorUtils interleavedArrayToMats:predictions + outputMat1:scoreTextCV + outputMat2:scoreAffinityCV + withSize:cv::Size(modelImageSize.width / 2, modelImageSize.height / 2)]; + NSArray* bBoxesList = [DetectorUtils getDetBoxesFromTextMap:scoreTextCV affinityMap:scoreAffinityCV usingTextThreshold:textThreshold linkThreshold:linkThreshold lowTextThreshold:lowTextThreshold]; + NSLog(@"Detected boxes: %lu", (unsigned long)bBoxesList.count); + bBoxesList = [DetectorUtils restoreBboxRatio:bBoxesList usingRestoreRatio: restoreRatio]; + bBoxesList = [DetectorUtils groupTextBoxes:bBoxesList centerThreshold:centerThreshold distanceThreshold:distanceThreshold heightThreshold:heightThreshold minSideThreshold:minSideThreshold maxSideThreshold:maxSideThreshold maxWidth:maxWidth]; + + return bBoxesList; +} + +- (NSArray *)runModel:(cv::Mat &)input { + NSArray *modelInput = [self preprocess:input]; + NSArray *modelResult = [self forward:modelInput]; + NSArray *result = [self postprocess:modelResult]; + return result; +} + +@end diff --git a/ios/RnExecutorch/models/ocr/RecognitionHandler.h b/ios/RnExecutorch/models/ocr/RecognitionHandler.h new file mode 100644 index 00000000..72ec004f --- /dev/null +++ b/ios/RnExecutorch/models/ocr/RecognitionHandler.h @@ -0,0 +1,16 @@ +#import "opencv2/opencv.hpp" + +constexpr int modelHeight = 64; +constexpr int largeModelWidth = 512; +constexpr int mediumModelWidth = 256; +constexpr int smallModelWidth = 128; +constexpr CGFloat lowConfidenceThreshold = 0.3; +constexpr CGFloat adjustContrast = 0.2; + +@interface RecognitionHandler : NSObject + +- (instancetype)initWithSymbols:(NSString *)symbols languageDictPath:(NSString *)languageDictPath; +- (void)loadRecognizers:(NSString *)largeRecognizerPath mediumRecognizerPath:(NSString *)mediumRecognizerPath smallRecognizerPath:(NSString *)smallRecognizerPath completion:(void (^)(BOOL, NSNumber *))completion; +- (NSArray *)recognize:(NSArray *)bBoxesList imgGray:(cv::Mat)imgGray desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight; + +@end diff --git a/ios/RnExecutorch/models/ocr/RecognitionHandler.mm b/ios/RnExecutorch/models/ocr/RecognitionHandler.mm new file mode 100644 index 00000000..50e303df --- /dev/null +++ b/ios/RnExecutorch/models/ocr/RecognitionHandler.mm @@ -0,0 +1,124 @@ +#import +#import "ExecutorchLib/ETModel.h" +#import "../../utils/Fetcher.h" +#import "../../utils/ImageProcessor.h" +#import "./utils/CTCLabelConverter.h" +#import "./utils/OCRUtils.h" +#import "./utils/RecognizerUtils.h" +#import "Recognizer.h" +#import "RecognitionHandler.h" + +/* + The RecognitionHandler class is responsible for loading the recognizer models and choosing the appropriate one based on the size of the detected text region; + it also handles converting the model output to text.
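+ Depending on the width of the cropped text region, the large (512 px), medium (256 px) or small (128 px) recognizer is used; when the confidence score is below lowConfidenceThreshold, recognition is retried on the crop rotated by 180 degrees and the higher-scoring result is kept.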
+ */ + +@implementation RecognitionHandler { + Recognizer *recognizerLarge; + Recognizer *recognizerMedium; + Recognizer *recognizerSmall; + CTCLabelConverter *converter; +} + +- (instancetype)initWithSymbols:(NSString *)symbols languageDictPath:(NSString *)languageDictPath { + self = [super init]; + if (self) { + recognizerLarge = [[Recognizer alloc] init]; + recognizerMedium = [[Recognizer alloc] init]; + recognizerSmall = [[Recognizer alloc] init]; + + converter = [[CTCLabelConverter alloc] initWithCharacters:symbols separatorList:@{} dictPathList:@{@"key": languageDictPath}]; + } + return self; +} + +- (void)loadRecognizers:(NSString *)largeRecognizerPath mediumRecognizerPath:(NSString *)mediumRecognizerPath smallRecognizerPath:(NSString *)smallRecognizerPath completion:(void (^)(BOOL, NSNumber *))completion { + dispatch_group_t group = dispatch_group_create(); + __block BOOL allSuccessful = YES; + + NSArray *recognizers = @[recognizerLarge, recognizerMedium, recognizerSmall]; + NSArray *paths = @[largeRecognizerPath, mediumRecognizerPath, smallRecognizerPath]; + + for (NSInteger i = 0; i < recognizers.count; i++) { + Recognizer *recognizer = recognizers[i]; + NSString *path = paths[i]; + + dispatch_group_enter(group); + [recognizer loadModel:[NSURL URLWithString: path] completion:^(BOOL success, NSNumber *errorCode) { + if (!success) { + allSuccessful = NO; + dispatch_group_leave(group); + completion(NO, errorCode); + return; + } + dispatch_group_leave(group); + }]; + } + + dispatch_group_notify(group, dispatch_get_main_queue(), ^{ + if (allSuccessful) { + completion(YES, @(0)); + } + }); +} + +- (NSArray *)runModel:(cv::Mat)croppedImage { + NSArray *result; + if(croppedImage.cols >= largeModelWidth) { + result = [recognizerLarge runModel:croppedImage]; + } else if (croppedImage.cols >= mediumModelWidth) { + result = [recognizerMedium runModel: croppedImage]; + } else { + result = [recognizerSmall runModel: croppedImage]; + } + + return result; +} + +- (NSArray *)recognize: (NSArray *)bBoxesList imgGray:(cv::Mat)imgGray desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight { + NSDictionary* ratioAndPadding = [RecognizerUtils calculateResizeRatioAndPaddings:imgGray.cols height:imgGray.rows desiredWidth:desiredWidth desiredHeight:desiredHeight]; + const int left = [ratioAndPadding[@"left"] intValue]; + const int top = [ratioAndPadding[@"top"] intValue]; + const CGFloat resizeRatio = [ratioAndPadding[@"resizeRatio"] floatValue]; + imgGray = [OCRUtils resizeWithPadding:imgGray desiredWidth:desiredWidth desiredHeight:desiredHeight]; + + NSMutableArray *predictions = [NSMutableArray array]; + for (NSDictionary *box in bBoxesList) { + cv::Mat croppedImage = [RecognizerUtils getCroppedImage:box image:imgGray modelHeight:modelHeight]; + if (croppedImage.empty()) { + continue; + } + croppedImage = [RecognizerUtils normalizeForRecognizer:croppedImage adjustContrast:adjustContrast]; + NSArray *result = [self runModel: croppedImage]; + + + NSNumber *confidenceScore = [result objectAtIndex:1]; + if([confidenceScore floatValue] < lowConfidenceThreshold){ + cv::rotate(croppedImage, croppedImage, cv::ROTATE_180); + + NSArray *rotatedResult = [self runModel: croppedImage]; + NSNumber *rotatedConfidenceScore = [rotatedResult objectAtIndex:1]; + + if ([rotatedConfidenceScore floatValue] > [confidenceScore floatValue]) { + result = rotatedResult; + confidenceScore = rotatedConfidenceScore; + } + } + + NSArray *predIndex = [result objectAtIndex:0]; + NSArray* decodedTexts = [converter 
decodeGreedy:predIndex length:(int)(predIndex.count)]; + + NSMutableArray *bbox = [NSMutableArray arrayWithCapacity:4]; + for (NSValue *coords in box[@"bbox"]){ + const CGPoint point = [coords CGPointValue]; + [bbox addObject: @{@"x": @((point.x - left) * resizeRatio), @"y": @((point.y - top) * resizeRatio)}]; + } + + NSDictionary *res = @{@"text": decodedTexts[0], @"bbox": bbox, @"score": confidenceScore}; + [predictions addObject:res]; + } + + return predictions; +} + +@end diff --git a/ios/RnExecutorch/models/ocr/Recognizer.h b/ios/RnExecutorch/models/ocr/Recognizer.h new file mode 100644 index 00000000..63047ac0 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/Recognizer.h @@ -0,0 +1,8 @@ +#import "opencv2/opencv.hpp" +#import "BaseModel.h" + +@interface Recognizer : BaseModel + +- (NSArray *)runModel:(cv::Mat &)input; + +@end diff --git a/ios/RnExecutorch/models/ocr/Recognizer.mm b/ios/RnExecutorch/models/ocr/Recognizer.mm new file mode 100644 index 00000000..a6d9f713 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/Recognizer.mm @@ -0,0 +1,72 @@ +#import "Recognizer.h" +#import "RecognizerUtils.h" +#import "../../utils/ImageProcessor.h" +#import "utils/OCRUtils.h" + +/* + The model used as the recognizer is based on the CRNN paper. + https://arxiv.org/pdf/1507.05717 + */ + +@implementation Recognizer { + cv::Size originalSize; +} + +- (cv::Size)getModelImageSize{ + NSArray *inputShape = [module getInputShape: @0]; + NSNumber *widthNumber = inputShape.lastObject; + NSNumber *heightNumber = inputShape[inputShape.count - 2]; + + const int height = [heightNumber intValue]; + const int width = [widthNumber intValue]; + return cv::Size(height, width); +} + +- (cv::Size)getModelOutputSize{ + NSArray *outputShape = [module getOutputShape: @0]; + NSNumber *widthNumber = outputShape.lastObject; + NSNumber *heightNumber = outputShape[outputShape.count - 2]; + + const int height = [heightNumber intValue]; + const int width = [widthNumber intValue]; + return cv::Size(height, width); +} + +- (NSArray *)preprocess:(cv::Mat &)input { + return [ImageProcessor matToNSArrayGray:input]; +} + +- (NSArray *)postprocess:(NSArray *)output { + const int modelOutputHeight = [self getModelOutputSize].height; + NSInteger numElements = [output.firstObject count]; + NSInteger numRows = (numElements + modelOutputHeight - 1) / modelOutputHeight; + cv::Mat resultMat = cv::Mat::zeros(numRows, modelOutputHeight, CV_32F); + NSInteger counter = 0; + NSInteger currentRow = 0; + for (NSNumber *num in output.firstObject) { + resultMat.at<float>(currentRow, counter) = [num floatValue]; + counter++; + if (counter >= modelOutputHeight) { + counter = 0; currentRow++; + } + } + + cv::Mat probabilities = [RecognizerUtils softmax:resultMat]; + NSMutableArray *predsNorm = [RecognizerUtils sumProbabilityRows:probabilities modelOutputHeight:modelOutputHeight]; + probabilities = [RecognizerUtils divideMatrix:probabilities byVector:predsNorm]; + NSArray *maxValuesIndices = [RecognizerUtils findMaxValuesAndIndices:probabilities]; + const CGFloat confidenceScore = [RecognizerUtils computeConfidenceScore:maxValuesIndices[0] indicesArray:maxValuesIndices[1]]; + + return @[maxValuesIndices[1], @(confidenceScore)]; +} + +- (NSArray *)runModel:(cv::Mat &)input { + NSArray *modelInput = [self preprocess:input]; + NSArray *modelResult = [self forward:modelInput]; + NSArray *result = [self postprocess:modelResult]; + + return result; +} + + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.h
b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.h new file mode 100644 index 00000000..037782f4 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.h @@ -0,0 +1,15 @@ +#import + +@interface CTCLabelConverter : NSObject + +@property(strong, nonatomic) NSMutableDictionary *dict; +@property(strong, nonatomic) NSArray *character; +@property(strong, nonatomic) NSDictionary *separatorList; +@property(strong, nonatomic) NSArray *ignoreIdx; +@property(strong, nonatomic) NSDictionary *dictList; + +- (instancetype)initWithCharacters:(NSString *)characters separatorList:(NSDictionary *)separatorList dictPathList:(NSDictionary *)dictPathList; +- (void)loadDictionariesWithDictPathList:(NSDictionary *)dictPathList; +- (NSArray *)decodeGreedy:(NSArray *)textIndex length:(NSInteger)length; + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.mm b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.mm new file mode 100644 index 00000000..644a29e2 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/CTCLabelConverter.mm @@ -0,0 +1,93 @@ +#import "CTCLabelConverter.h" + +@implementation CTCLabelConverter + +- (instancetype)initWithCharacters:(NSString *)characters separatorList:(NSDictionary *)separatorList dictPathList:(NSDictionary *)dictPathList { + self = [super init]; + if (self) { + _dict = [NSMutableDictionary dictionary]; + NSMutableArray *mutableCharacters = [NSMutableArray arrayWithObject:@"[blank]"]; + + for (NSUInteger i = 0; i < [characters length]; i++) { + NSString *charStr = [NSString stringWithFormat:@"%C", [characters characterAtIndex:i]]; + [mutableCharacters addObject:charStr]; + self.dict[charStr] = @(i + 1); + } + + _character = [mutableCharacters copy]; + _separatorList = separatorList; + + NSMutableArray *ignoreIndexes = [NSMutableArray arrayWithObject:@(0)]; + for (NSString *sep in separatorList.allValues) { + NSUInteger index = [characters rangeOfString:sep].location; + if (index != NSNotFound) { + [ignoreIndexes addObject:@(index)]; + } + } + _ignoreIdx = [ignoreIndexes copy]; + _dictList = [NSDictionary dictionary]; + [self loadDictionariesWithDictPathList:dictPathList]; + } + return self; +} + +- (void)loadDictionariesWithDictPathList:(NSDictionary *)dictPathList { + NSMutableDictionary *tempDictList = [NSMutableDictionary dictionary]; + for (NSString *lang in dictPathList.allKeys) { + NSString *dictPath = dictPathList[lang]; + NSError *error; + NSString *fileContents = [NSString stringWithContentsOfFile:dictPath encoding:NSUTF8StringEncoding error:&error]; + if (error) { + NSLog(@"Error reading file: %@", error.localizedDescription); + continue; + } + NSArray *lines = [fileContents componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]]; + [tempDictList setObject:lines forKey:lang]; + } + _dictList = [tempDictList copy]; +} + +- (NSArray *)decodeGreedy:(NSArray *)textIndex length:(NSInteger)length { + NSMutableArray *texts = [NSMutableArray array]; + NSUInteger index = 0; + + while (index < textIndex.count) { + NSUInteger segmentLength = MIN(length, textIndex.count - index); + NSRange range = NSMakeRange(index, segmentLength); + NSArray *subArray = [textIndex subarrayWithRange:range]; + + NSMutableString *text = [NSMutableString string]; + NSNumber *lastChar = nil; + + NSMutableArray *isNotRepeated = [NSMutableArray arrayWithObject:@YES]; + NSMutableArray *isNotIgnored = [NSMutableArray array]; + + for (NSUInteger i = 0; i < subArray.count; i++) { + NSNumber *currentChar = subArray[i]; + if (i > 0) { + 
[isNotRepeated addObject:@(![lastChar isEqualToNumber:currentChar])]; + } + [isNotIgnored addObject:@(![self.ignoreIdx containsObject:currentChar])]; + + lastChar = currentChar; + } + + for (NSUInteger j = 0; j < subArray.count; j++) { + if ([isNotRepeated[j] boolValue] && [isNotIgnored[j] boolValue]) { + NSUInteger charIndex = [subArray[j] unsignedIntegerValue]; + [text appendString:self.character[charIndex]]; + } + } + + [texts addObject:text.copy]; + index += segmentLength; + + if (segmentLength < length) { + break; + } + } + + return texts.copy; +} + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/DetectorUtils.h b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.h new file mode 100644 index 00000000..8330cf98 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.h @@ -0,0 +1,21 @@ +#import + +constexpr int verticalLineThreshold = 20; + +@interface DetectorUtils : NSObject + ++ (void)interleavedArrayToMats:(NSArray *)array + outputMat1:(cv::Mat &)mat1 + outputMat2:(cv::Mat &)mat2 + withSize:(cv::Size)size; ++ (NSArray *)getDetBoxesFromTextMap:(cv::Mat)textMap affinityMap:(cv::Mat)affinityMap usingTextThreshold:(CGFloat)textThreshold linkThreshold:(CGFloat)linkThreshold lowTextThreshold:(CGFloat)lowTextThreshold; ++ (NSArray *)restoreBboxRatio:(NSArray *)boxes usingRestoreRatio:(CGFloat)restoreRatio; ++ (NSArray *)groupTextBoxes:(NSArray *)polys + centerThreshold:(CGFloat)centerThreshold + distanceThreshold:(CGFloat)distanceThreshold + heightThreshold:(CGFloat)heightThreshold + minSideThreshold:(int)minSideThreshold + maxSideThreshold:(int)maxSideThreshold + maxWidth:(int)maxWidth; + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/DetectorUtils.mm b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.mm new file mode 100644 index 00000000..5e49f1f0 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/DetectorUtils.mm @@ -0,0 +1,547 @@ +#import "DetectorUtils.h" + +@implementation DetectorUtils + ++ (void)interleavedArrayToMats:(NSArray *)array + outputMat1:(cv::Mat &)mat1 + outputMat2:(cv::Mat &)mat2 + withSize:(cv::Size)size { + mat1 = cv::Mat(size.height, size.width, CV_32F); + mat2 = cv::Mat(size.height, size.width, CV_32F); + + for (NSUInteger idx = 0; idx < array.count; idx++) { + const CGFloat value = [array[idx] doubleValue]; + const int x = (idx / 2) % size.width; + const int y = (idx / 2) / size.width; + + if (idx % 2 == 0) { + mat1.at(y, x) = value; + } else { + mat2.at(y, x) = value; + } + } +} + +/** + * This method applies a series of image processing operations to identify likely areas of text in the textMap and return the bounding boxes for single words. + * + * @param textMap A cv::Mat representing a heat map of the characters of text being present in an image. + * @param affinityMap A cv::Mat representing a heat map of the affinity between characters. + * @param textThreshold A CGFloat representing the threshold for the text map. + * @param linkThreshold A CGFloat representing the threshold for the affinity map. + * @param lowTextThreshold A CGFloat representing the low text. + * + * @return An NSArray containing NSDictionary objects. Each dictionary includes: + * - "bbox": an NSArray of CGPoint values representing the vertices of the detected text box. + * - "angle": an NSNumber representing the rotation angle of the box. 
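+ * Connected components whose area is below 10 pixels, or whose peak value in the text map is below lowTextThreshold, are discarded.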
+ */ ++ (NSArray *)getDetBoxesFromTextMap:(cv::Mat)textMap affinityMap:(cv::Mat)affinityMap usingTextThreshold:(CGFloat)textThreshold linkThreshold:(CGFloat)linkThreshold lowTextThreshold:(CGFloat)lowTextThreshold { + const int imgH = textMap.rows; + const int imgW = textMap.cols; + cv::Mat textScore; + cv::Mat affinityScore; + cv::threshold(textMap, textScore, textThreshold, 1, cv::THRESH_BINARY); + cv::threshold(affinityMap, affinityScore, linkThreshold, 1, cv::THRESH_BINARY); + cv::Mat textScoreComb = textScore + affinityScore; + cv::threshold(textScoreComb, textScoreComb, 0, 1, cv::THRESH_BINARY); + cv::Mat binaryMat; + textScoreComb.convertTo(binaryMat, CV_8UC1); + + cv::Mat labels, stats, centroids; + const int nLabels = cv::connectedComponentsWithStats(binaryMat, labels, stats, centroids, 4); + + NSMutableArray *detectedBoxes = [NSMutableArray array]; + for (int i = 1; i < nLabels; i++) { + const int area = stats.at(i, cv::CC_STAT_AREA); + if (area < 10) continue; + + cv::Mat mask = (labels == i); + CGFloat maxVal; + cv::minMaxLoc(textMap, NULL, &maxVal, NULL, NULL, mask); + if (maxVal < lowTextThreshold) continue; + + cv::Mat segMap = cv::Mat::zeros(textMap.size(), CV_8U); + segMap.setTo(255, mask); + + const int x = stats.at(i, cv::CC_STAT_LEFT); + const int y = stats.at(i, cv::CC_STAT_TOP); + const int w = stats.at(i, cv::CC_STAT_WIDTH); + const int h = stats.at(i, cv::CC_STAT_HEIGHT); + const int dilationRadius = (int)(sqrt((double)(area / MAX(w, h)) ) * 2.0); + const int sx = MAX(x - dilationRadius, 0); + const int ex = MIN(x + w + dilationRadius + 1, imgW); + const int sy = MAX(y - dilationRadius, 0); + const int ey = MIN(y + h + dilationRadius + 1, imgH); + + cv::Rect roi(sx, sy, ex - sx, ey - sy); + cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + dilationRadius, 1 + dilationRadius)); + cv::Mat roiSegMap = segMap(roi); + cv::dilate(roiSegMap, roiSegMap, kernel); + + std::vector> contours; + cv::findContours(segMap, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE); + if (!contours.empty()) { + cv::RotatedRect minRect = cv::minAreaRect(contours[0]); + cv::Point2f vertices[4]; + minRect.points(vertices); + NSMutableArray *pointsArray = [NSMutableArray arrayWithCapacity:4]; + for (int j = 0; j < 4; j++) { + const CGPoint point = CGPointMake(vertices[j].x, vertices[j].y); + [pointsArray addObject:[NSValue valueWithCGPoint:point]]; + } + NSDictionary *dict = @{@"bbox": pointsArray, @"angle": @(minRect.angle)}; + [detectedBoxes addObject:dict]; + } + } + + return detectedBoxes; +} + ++ (NSArray *)restoreBboxRatio:(NSArray *)boxes usingRestoreRatio:(CGFloat)restoreRatio { + NSMutableArray *result = [NSMutableArray array]; + for (NSUInteger i = 0; i < [boxes count]; i++) { + NSDictionary *box = boxes[i]; + NSMutableArray *boxArray = [NSMutableArray arrayWithCapacity:4]; + for (NSValue *value in box[@"bbox"]) { + CGPoint point = [value CGPointValue]; + point.x *= restoreRatio; + point.y *= restoreRatio; + [boxArray addObject:[NSValue valueWithCGPoint:point]]; + } + NSDictionary *dict = @{@"bbox": boxArray, @"angle": box[@"angle"]}; + [result addObject:dict]; + } + + return result; +} + +/** + * This method normalizes angle returned from cv::minAreaRect function which ranges from 0 to 90 degrees. 
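+ * Angles above 45 degrees are mapped to angle - 90, so the returned value always lies in the (-45, 45] range.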
+ **/ ++ (CGFloat)normalizeAngle:(CGFloat)angle { + if (angle > 45) { + return angle - 90; + } + return angle; +} + ++ (CGPoint)midpointBetweenPoint:(CGPoint)p1 andPoint:(CGPoint)p2 { + return CGPointMake((p1.x + p2.x) / 2, (p1.y + p2.y) / 2); +} + ++ (CGFloat)distanceFromPoint:(CGPoint)p1 toPoint:(CGPoint)p2 { + const CGFloat xDist = (p2.x - p1.x); + const CGFloat yDist = (p2.y - p1.y); + return sqrt(xDist * xDist + yDist * yDist); +} + ++ (CGPoint)centerOfBox:(NSArray *)box { + return [self midpointBetweenPoint:[box[0] CGPointValue] andPoint:[box[2] CGPointValue]]; +} + ++ (CGFloat)maxSideLength:(NSArray *)points { + CGFloat maxSideLength = 0; + NSInteger numOfPoints = points.count; + for (NSInteger i = 0; i < numOfPoints; i++) { + const CGPoint currentPoint = [points[i] CGPointValue]; + const CGPoint nextPoint = [points[(i + 1) % numOfPoints] CGPointValue]; + + const CGFloat sideLength = [self distanceFromPoint:currentPoint toPoint:nextPoint]; + if (sideLength > maxSideLength) { + maxSideLength = sideLength; + } + } + return maxSideLength; +} + ++ (CGFloat)minSideLength:(NSArray *)points { + CGFloat minSideLength = CGFLOAT_MAX; + NSInteger numOfPoints = points.count; + + for (NSInteger i = 0; i < numOfPoints; i++) { + const CGPoint currentPoint = [points[i] CGPointValue]; + const CGPoint nextPoint = [points[(i + 1) % numOfPoints] CGPointValue]; + + const CGFloat sideLength = [self distanceFromPoint:currentPoint toPoint:nextPoint]; + if (sideLength < minSideLength) { + minSideLength = sideLength; + } + } + + return minSideLength; +} + ++ (CGFloat)calculateMinimalDistanceBetweenBox:(NSArray *)box1 andBox:(NSArray *)box2 { + CGFloat minDistance = CGFLOAT_MAX; + for (NSValue *value1 in box1) { + const CGPoint corner1 = [value1 CGPointValue]; + for (NSValue *value2 in box2) { + const CGPoint corner2 = [value2 CGPointValue]; + const CGFloat distance = [self distanceFromPoint:corner1 toPoint:corner2]; + if (distance < minDistance) { + minDistance = distance; + } + } + } + return minDistance; +} + ++ (NSArray *)rotateBox:(NSArray *)box withAngle:(CGFloat)angle { + const CGPoint center = [self centerOfBox:box]; + + const CGFloat radians = angle * M_PI / 180.0; + + NSMutableArray *rotatedPoints = [NSMutableArray arrayWithCapacity:4]; + for (NSValue *value in box) { + const CGPoint point = [value CGPointValue]; + + const CGFloat translatedX = point.x - center.x; + const CGFloat translatedY = point.y - center.y; + + const CGFloat rotatedX = translatedX * cos(radians) - translatedY * sin(radians); + const CGFloat rotatedY = translatedX * sin(radians) + translatedY * cos(radians); + + const CGPoint rotatedPoint = CGPointMake(rotatedX + center.x, rotatedY + center.y); + [rotatedPoints addObject:[NSValue valueWithCGPoint:rotatedPoint]]; + } + + return rotatedPoints; +} + +/** + * Orders a set of points in a clockwise direction starting with the top-left point. + * + * Process: + * 1. It iterates through each CGPoint extracted from the NSValues. + * 2. For each point, it calculates the sum (x + y) and difference (y - x) of the coordinates. + * 3. Points are classified into: + * - Top-left: Minimum sum. + * - Bottom-right: Maximum sum. + * - Top-right: Minimum difference. + * - Bottom-left: Maximum difference. + * 4. The points are ordered starting from the top-left in a clockwise manner: top-left, top-right, bottom-right, bottom-left. 
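+ * Note: taking the min/max of the coordinate sums and differences is a standard corner-ordering heuristic and assumes a roughly axis-aligned quadrilateral.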
+ */ ++ (NSArray *)orderPointsClockwise:(NSArray *)points{ + CGPoint topLeft, topRight, bottomRight, bottomLeft; + CGFloat minSum = FLT_MAX; + CGFloat maxSum = -FLT_MAX; + CGFloat minDiff = FLT_MAX; + CGFloat maxDiff = -FLT_MAX; + + for (NSValue *value in points) { + const CGPoint pt = [value CGPointValue]; + const CGFloat sum = pt.x + pt.y; + const CGFloat diff = pt.y - pt.x; + + if (sum < minSum) { + minSum = sum; + topLeft = pt; + } + if (sum > maxSum) { + maxSum = sum; + bottomRight = pt; + } + if (diff < minDiff) { + minDiff = diff; + topRight = pt; + } + if (diff > maxDiff) { + maxDiff = diff; + bottomLeft = pt; + } + } + + NSArray *rect = @[[NSValue valueWithCGPoint:topLeft], + [NSValue valueWithCGPoint:topRight], + [NSValue valueWithCGPoint:bottomRight], + [NSValue valueWithCGPoint:bottomLeft]]; + + return rect; +} + ++ (std::vector)pointsFromNSValues:(NSArray *)nsValues { + std::vector points; + for (NSValue *value in nsValues) { + const CGPoint point = [value CGPointValue]; + points.emplace_back(point.x, point.y); + } + return points; +} + ++ (NSArray *)nsValuesFromPoints:(cv::Point2f *)points count:(int)count { + NSMutableArray *nsValues = [[NSMutableArray alloc] initWithCapacity:count]; + for (int i = 0; i < count; i++) { + [nsValues addObject:[NSValue valueWithCGPoint:CGPointMake(points[i].x, points[i].y)]]; + } + return nsValues; +} + ++ (NSArray *)mergeRotatedBoxes:(NSArray *)box1 withBox:(NSArray *)box2 { + box1 = [self orderPointsClockwise:box1]; + box2 = [self orderPointsClockwise:box2]; + + std::vector points1 = [self pointsFromNSValues:box1]; + std::vector points2 = [self pointsFromNSValues:box2]; + + std::vector allPoints; + allPoints.insert(allPoints.end(), points1.begin(), points1.end()); + allPoints.insert(allPoints.end(), points2.begin(), points2.end()); + + std::vector hullIndices; + cv::convexHull(allPoints, hullIndices, false); + + std::vector hullPoints; + for (int idx : hullIndices) { + hullPoints.push_back(allPoints[idx]); + } + + cv::RotatedRect minAreaRect = cv::minAreaRect(hullPoints); + + cv::Point2f rectPoints[4]; + minAreaRect.points(rectPoints); + + return [self nsValuesFromPoints:rectPoints count:4]; +} + ++ (NSMutableArray *)removeSmallBoxesFromArray:(NSArray *)boxes usingMinSideThreshold:(CGFloat)minSideThreshold maxSideThreshold:(CGFloat)maxSideThreshold { + NSMutableArray *filteredBoxes = [NSMutableArray array]; + + for (NSDictionary *box in boxes) { + const CGFloat maxSideLength = [self maxSideLength:box[@"bbox"]]; + const CGFloat minSideLength = [self minSideLength:box[@"bbox"]]; + if (minSideLength > minSideThreshold && maxSideLength > maxSideThreshold) { + [filteredBoxes addObject:box]; + } + } + + return filteredBoxes; +} + ++ (CGFloat)minimumYFromBox:(NSArray *)box { + __block CGFloat minY = CGFLOAT_MAX; + [box enumerateObjectsUsingBlock:^(NSValue * _Nonnull obj, NSUInteger idx, BOOL * _Nonnull stop) { + const CGPoint pt = [obj CGPointValue]; + if (pt.y < minY) { + minY = pt.y; + } + }]; + return minY; +} + +/** + * This method calculates the distances between each sequential pair of points in a presumed quadrilateral, + * identifies the two shortest sides, and fits a linear model to the midpoints of these sides. It also evaluates + * whether the resulting line should be considered vertical based on a predefined threshold for the x-coordinate differences. + * + * If the line is vertical it is fitted as a function of x = my + c, otherwise as y = mx + c. 
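+ * The midpoints of the two shortest sides approximate the two ends of the text line, so the fitted line roughly follows the reading direction of the box.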
+ * + * @return A NSDictionary containing: + * - "slope": NSNumber representing the slope (m) of the line. + * - "intercept": NSNumber representing the line's intercept (c) with y-axis. + * - "isVertical": NSNumber (boolean) indicating whether the line is considered vertical. + */ ++ (NSDictionary *)fitLineToShortestSides:(NSArray *)points { + NSMutableArray *sides = [NSMutableArray array]; + NSMutableArray *midpoints = [NSMutableArray array]; + + for (int i = 0; i < 4; i++) { + const CGPoint p1 = [points[i] CGPointValue]; + const CGPoint p2 = [points[(i + 1) % 4] CGPointValue]; + + const CGFloat sideLength = [self distanceFromPoint:p1 toPoint:p2]; + [sides addObject:@{@"length": @(sideLength), @"index": @(i)}]; + [midpoints addObject:[NSValue valueWithCGPoint:[self midpointBetweenPoint:p1 andPoint:p2]]]; + } + + [sides sortUsingDescriptors:@[[NSSortDescriptor sortDescriptorWithKey:@"length" ascending:YES]]]; + + const CGPoint midpoint1 = [midpoints[[sides[0][@"index"] intValue]] CGPointValue]; + const CGPoint midpoint2 = [midpoints[[sides[1][@"index"] intValue]] CGPointValue]; + const CGFloat dx = fabs(midpoint2.x - midpoint1.x); + + CGFloat m, c; + BOOL isVertical; + + std::vector cvMidPoints = {cv::Point2f(midpoint1.x, midpoint1.y), cv::Point2f(midpoint2.x, midpoint2.y)}; + cv::Vec4f line; + + if (dx < verticalLineThreshold) { + for (auto &pt : cvMidPoints) std::swap(pt.x, pt.y); + cv::fitLine(cvMidPoints, line, cv::DIST_L2, 0, 0.01, 0.01); + m = line[1] / line[0]; + c = line[3] - m * line[2]; + isVertical = YES; + } else { + cv::fitLine(cvMidPoints, line, cv::DIST_L2, 0, 0.01, 0.01); + m = line[1] / line[0]; + c = line[3] - m * line[2]; + isVertical = NO; + } + + return @{@"slope": @(m), @"intercept": @(c), @"isVertical": @(isVertical)}; +} + +/** + * This method assesses each box from a provided array, checks its center against the center of a "current box", + * and evaluates its alignment with a specified line equation. The function specifically searches for the box + * whose center is closest to the current box, that has not been ignored, and fits within a defined distance from the line. + * + * @param boxes An NSArray of NSDictionary objects where each dictionary represents a box with keys "bbox" and "angle". + * "bbox" is an NSArray of NSValue objects each encapsulating CGPoint that define the box vertices. + * "angle" is a NSNumber representing the box's rotation angle. + * @param ignoredIdxs An NSSet of NSNumber objects representing indices of boxes to ignore in the evaluation. + * @param currentBox An NSArray of NSValue objects encapsulating CGPoints representing the current box to compare against. + * @param isVertical A pointer to a BOOL indicating if the line to compare distance to is vertical. + * @param m The slope (gradient) of the line against which the box's alignment is checked. + * @param c The y-intercept of the line equation y = mx + c. + * @param centerThreshold A multiplier to determine the threshold for the distance between the box's center and the line. + * + * @return A NSDictionary containing: + * - "idx" : NSNumber indicating the index of the found box in the original NSArray. + * - "boxHeight" : NSNumber representing the shortest side length of the found box. + * Returns nil if no suitable box is found. 
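+ * A candidate is accepted only if the distance of its center from the fitted line (measured along the x-axis for vertical lines, along the y-axis otherwise) is smaller than boxHeight * centerThreshold; among accepted candidates, the one closest to the current box is returned.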
+ */ ++ (NSDictionary *)findClosestBox:(NSArray *)boxes + ignoredIdxs:(NSSet *)ignoredIdxs + currentBox:(NSArray *)currentBox + isVertical:(BOOL)isVertical + m:(CGFloat)m + c:(CGFloat)c + centerThreshold:(CGFloat)centerThreshold +{ + CGFloat smallestDistance = CGFLOAT_MAX; + NSInteger idx = -1; + CGFloat boxHeight = 0; + const CGPoint centerOfCurrentBox = [self centerOfBox:currentBox]; + + for (NSUInteger i = 0; i < boxes.count; i++) { + if ([ignoredIdxs containsObject:@(i)]) { + continue; + } + NSArray *bbox = boxes[i][@"bbox"]; + const CGPoint centerOfProcessedBox = [self centerOfBox:bbox]; + const CGFloat distanceBetweenCenters = [self distanceFromPoint:centerOfCurrentBox toPoint:centerOfProcessedBox]; + + if (distanceBetweenCenters >= smallestDistance) { + continue; + } + + boxHeight = [self minSideLength:bbox]; + + const CGFloat lineDistance = (isVertical ? + fabs(centerOfProcessedBox.x - (m * centerOfProcessedBox.y + c)) : + fabs(centerOfProcessedBox.y - (m * centerOfProcessedBox.x + c))); + + if (lineDistance < boxHeight * centerThreshold) { + idx = i; + smallestDistance = distanceBetweenCenters; + } + } + + return idx != -1 ? @{@"idx": @(idx), @"boxHeight": @(boxHeight)} : nil; +} + +/** + * This method processes an array of text box dictionaries, each containing details about individual text boxes, + * and attempts to group and merge these boxes based on specified criteria including proximity, alignment, + * and size thresholds. It prioritizes merging of boxes that are aligned closely in angle, are near each other, + * and whose sizes are compatible based on the given thresholds. + * + * @param boxes An array of NSDictionary objects where each dictionary represents a text box. Each dictionary must have + * at least a "bbox" key with an NSArray of NSValue wrapping CGPoints defining the box vertices, + * and an "angle" key indicating the orientation of the box. + * @param centerThreshold A CGFloat representing the threshold for considering the distance between center and fitted line. + * @param distanceThreshold A CGFloat that defines the maximum allowed distance between boxes for them to be considered for merging. + * @param heightThreshold A CGFloat representing the maximum allowed difference in height between boxes for merging. + * @param minSideThreshold An int that defines the minimum dimension threshold to filter out small boxes after grouping. + * @param maxSideThreshold An int that specifies the maximum dimension threshold for filtering boxes post-grouping. + * @param maxWidth An int that represents the maximum width allowable for a merged box. + * + * @return An NSArray of NSDictionary objects representing the merged boxes. Each dictionary contains: + * - "bbox": An NSArray of NSValue each containing a CGPoint that defines the vertices of the merged box. + * - "angle": NSNumber representing the computed orientation of the merged box. + * + * Processing Steps: + * 1. Sort initial boxes based on their maximum side length. + * 2. Sequentially merge boxes considering alignment, proximity, and size compatibility. + * 3. Post-processing to remove any boxes that are too small or exceed max side criteria. + * 4. Sort the final array of boxes by their vertical positions. 
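+ * Note: merging of a group stops early once the merged box's longest side exceeds maxWidth.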
+ */ ++ (NSArray *)groupTextBoxes:(NSMutableArray *)boxes + centerThreshold:(CGFloat)centerThreshold + distanceThreshold:(CGFloat)distanceThreshold + heightThreshold:(CGFloat)heightThreshold + minSideThreshold:(int)minSideThreshold + maxSideThreshold:(int)maxSideThreshold + maxWidth:(int)maxWidth +{ + // Sort boxes based on their maximum side length + boxes = [boxes sortedArrayUsingComparator:^NSComparisonResult(NSDictionary *obj1, NSDictionary *obj2) { + const CGFloat maxLen1 = [self maxSideLength:obj1[@"bbox"]]; + const CGFloat maxLen2 = [self maxSideLength:obj2[@"bbox"]]; + return (maxLen1 < maxLen2) ? NSOrderedDescending : (maxLen1 > maxLen2) ? NSOrderedAscending : NSOrderedSame; + }].mutableCopy; + + NSMutableArray *mergedArray = [NSMutableArray array]; + CGFloat lineAngle; + while (boxes.count > 0) { + NSMutableDictionary *currentBox = [boxes[0] mutableCopy]; + CGFloat normalizedAngle = [self normalizeAngle:[currentBox[@"angle"] floatValue]]; + [boxes removeObjectAtIndex:0]; + NSMutableArray *ignoredIdxs = [NSMutableArray array]; + + while (YES) { + //Find all aligned boxes and merge them until max_size is reached or no more boxes can be merged + NSDictionary *fittedLine = [self fitLineToShortestSides:currentBox[@"bbox"]]; + const CGFloat slope = [fittedLine[@"slope"] floatValue]; + const CGFloat intercept = [fittedLine[@"intercept"] floatValue]; + const BOOL isVertical = [fittedLine[@"isVertical"] boolValue]; + + lineAngle = atan(slope) * 180 / M_PI; + if (isVertical){ + lineAngle = -90; + } + + NSDictionary *closestBoxInfo = [self findClosestBox:boxes ignoredIdxs:[NSSet setWithArray:ignoredIdxs] currentBox:currentBox[@"bbox"] isVertical:isVertical m:slope c:intercept centerThreshold:centerThreshold]; + if (closestBoxInfo == nil) break; + + NSInteger candidateIdx = [closestBoxInfo[@"idx"] integerValue]; + NSMutableDictionary *candidateBox = [boxes[candidateIdx] mutableCopy]; + const CGFloat candidateHeight = [closestBoxInfo[@"boxHeight"] floatValue]; + + if (([candidateBox[@"angle"] isEqual: @90] && !isVertical) || ([candidateBox[@"angle"] isEqual: @0] && isVertical)) { + candidateBox[@"bbox"] = [self rotateBox:candidateBox[@"bbox"] withAngle:normalizedAngle]; + } + + const CGFloat minDistance = [self calculateMinimalDistanceBetweenBox:candidateBox[@"bbox"] andBox:currentBox[@"bbox"]]; + const CGFloat mergedHeight = [self minSideLength:currentBox[@"bbox"]]; + if (minDistance < distanceThreshold * candidateHeight && fabs(mergedHeight - candidateHeight) < candidateHeight * heightThreshold) { + currentBox[@"bbox"] = [self mergeRotatedBoxes:currentBox[@"bbox"] withBox:candidateBox[@"bbox"]]; + [boxes removeObjectAtIndex:candidateIdx]; + [ignoredIdxs removeAllObjects]; + if ([self maxSideLength:currentBox[@"bbox"]] > maxWidth){ + break; + } + } else { + [ignoredIdxs addObject:@(candidateIdx)]; + } + } + + [mergedArray addObject:@{@"bbox" : currentBox[@"bbox"], @"angle" : @(lineAngle)}]; + } + + // Remove small boxes and sort by vertical + mergedArray = [self removeSmallBoxesFromArray:mergedArray usingMinSideThreshold:minSideThreshold maxSideThreshold:maxSideThreshold]; + + NSArray *sortedBoxes = [mergedArray sortedArrayUsingComparator:^NSComparisonResult(NSDictionary *obj1, NSDictionary *obj2) { + NSArray *coords1 = obj1[@"bbox"]; + NSArray *coords2 = obj2[@"bbox"]; + const CGFloat minY1 = [self minimumYFromBox:coords1]; + const CGFloat minY2 = [self minimumYFromBox:coords2]; + return (minY1 < minY2) ? NSOrderedAscending : (minY1 > minY2) ? 
NSOrderedDescending : NSOrderedSame; + }]; + + return sortedBoxes; +} + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/OCRUtils.h b/ios/RnExecutorch/models/ocr/utils/OCRUtils.h new file mode 100644 index 00000000..0304ad37 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/OCRUtils.h @@ -0,0 +1,7 @@ +#import + +@interface OCRUtils : NSObject + ++ (cv::Mat)resizeWithPadding:(cv::Mat)img desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight; + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/OCRUtils.mm b/ios/RnExecutorch/models/ocr/utils/OCRUtils.mm new file mode 100644 index 00000000..3bec6244 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/OCRUtils.mm @@ -0,0 +1,49 @@ +#import "OCRUtils.h" + +@implementation OCRUtils + ++ (cv::Mat)resizeWithPadding:(cv::Mat)img desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight { + const int height = img.rows; + const int width = img.cols; + const float heightRatio = (float)desiredHeight / height; + const float widthRatio = (float)desiredWidth / width; + const float resizeRatio = MIN(heightRatio, widthRatio); + + const int newWidth = width * resizeRatio; + const int newHeight = height * resizeRatio; + + cv::Mat resizedImg; + cv::resize(img, resizedImg, cv::Size(newWidth, newHeight), 0, 0, cv::INTER_AREA); + + const int cornerPatchSize = MAX(1, MIN(height, width) / 30); + std::vector corners = { + img(cv::Rect(0, 0, cornerPatchSize, cornerPatchSize)), + img(cv::Rect(width - cornerPatchSize, 0, cornerPatchSize, cornerPatchSize)), + img(cv::Rect(0, height - cornerPatchSize, cornerPatchSize, cornerPatchSize)), + img(cv::Rect(width - cornerPatchSize, height - cornerPatchSize, cornerPatchSize, cornerPatchSize)) + }; + + cv::Scalar backgroundScalar = cv::mean(corners[0]); + for (int i = 1; i < corners.size(); i++) { + backgroundScalar += cv::mean(corners[i]); + } + backgroundScalar /= (double)corners.size(); + + backgroundScalar[0] = cvFloor(backgroundScalar[0]); + backgroundScalar[1] = cvFloor(backgroundScalar[1]); + backgroundScalar[2] = cvFloor(backgroundScalar[2]); + + const int deltaW = desiredWidth - newWidth; + const int deltaH = desiredHeight - newHeight; + const int top = deltaH / 2; + const int bottom = deltaH - top; + const int left = deltaW / 2; + const int right = deltaW - left; + + cv::Mat centeredImg; + cv::copyMakeBorder(resizedImg, centeredImg, top, bottom, left, right, cv::BORDER_CONSTANT, backgroundScalar); + + return centeredImg; +} + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.h b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.h new file mode 100644 index 00000000..337cdc9f --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.h @@ -0,0 +1,17 @@ +#import + +@interface RecognizerUtils : NSObject + ++ (CGFloat)calculateRatio:(int)width height:(int)height; ++ (cv::Mat)computeRatioAndResize:(cv::Mat)img width:(int)width height:(int)height modelHeight:(int)modelHeight; ++ (cv::Mat)normalizeForRecognizer:(cv::Mat)image adjustContrast:(double)adjustContrast; ++ (cv::Mat)adjustContrastGrey:(cv::Mat)img target:(double)target; ++ (cv::Mat)divideMatrix:(cv::Mat)matrix byVector:(NSArray *)vector; ++ (cv::Mat)softmax:(cv::Mat)inputs; ++ (NSDictionary *)calculateResizeRatioAndPaddings:(int)width height:(int)height desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight; ++ (cv::Mat)getCroppedImage:(NSDictionary *)box image:(cv::Mat)image modelHeight:(int)modelHeight; ++ (NSMutableArray *)sumProbabilityRows:(cv::Mat)probabilities 
modelOutputHeight:(int)modelOutputHeight; ++ (NSArray *)findMaxValuesAndIndices:(cv::Mat)probabilities; ++ (double)computeConfidenceScore:(NSArray *)valuesArray indicesArray:(NSArray *)indicesArray; + +@end diff --git a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm new file mode 100644 index 00000000..74048e20 --- /dev/null +++ b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm @@ -0,0 +1,202 @@ +#import "OCRUtils.h" +#import "RecognizerUtils.h" + +@implementation RecognizerUtils + ++ (CGFloat)calculateRatio:(int)width height:(int)height { + CGFloat ratio = (CGFloat)width / (CGFloat)height; + if (ratio < 1.0) { + ratio = 1.0 / ratio; + } + return ratio; +} + ++ (cv::Mat)computeRatioAndResize:(cv::Mat)img width:(int)width height:(int)height modelHeight:(int)modelHeight { + CGFloat ratio = (CGFloat)width / (CGFloat)height; + if (ratio < 1.0) { + ratio = [self calculateRatio:width height:height]; + cv::resize(img, img, cv::Size(modelHeight, (int)(modelHeight * ratio)), 0, 0, cv::INTER_LANCZOS4); + } else { + cv::resize(img, img, cv::Size((int)(modelHeight * ratio), modelHeight), 0, 0, cv::INTER_LANCZOS4); + } + return img; +} + ++ (cv::Mat)adjustContrastGrey:(cv::Mat)img target:(double)target { + double contrast = 0.0; + int high = 0; + int low = 255; + + for (int i = 0; i < img.rows; ++i) { + for (int j = 0; j < img.cols; ++j) { + uchar pixel = img.at(i, j); + high = MAX(high, pixel); + low = MIN(low, pixel); + } + } + contrast = (high - low) / 255.0; + + if (contrast < target) { + const double ratio = 200.0 / MAX(10, high - low); + img.convertTo(img, CV_32F); + img = ((img - low + 25) * ratio); + + cv::threshold(img, img, 255, 255, cv::THRESH_TRUNC); + cv::threshold(img, img, 0, 0, cv::THRESH_TOZERO); + + img.convertTo(img, CV_8U); + } + + return img; +} + ++ (cv::Mat)normalizeForRecognizer:(cv::Mat)image adjustContrast:(double)adjustContrast { + if (adjustContrast > 0) { + image = [self adjustContrastGrey:image target:adjustContrast]; + } + + int desiredWidth = 128; + if (image.cols >= 512) { + desiredWidth = 512; + } else if (image.cols >= 256) { + desiredWidth = 256; + } + + image = [OCRUtils resizeWithPadding:image desiredWidth:desiredWidth desiredHeight:64]; + + image.convertTo(image, CV_32F, 1.0 / 255.0); + image = (image - 0.5) * 2.0; + + return image; +} + ++ (cv::Mat)divideMatrix:(cv::Mat)matrix byVector:(NSArray *)vector { + cv::Mat result = matrix.clone(); + + for (int i = 0; i < matrix.rows; i++) { + const float divisor = [vector[i] floatValue]; + for (int j = 0; j < matrix.cols; j++) { + result.at(i, j) /= divisor; + } + } + + return result; +} + ++ (cv::Mat)softmax:(cv::Mat) inputs { + cv::Mat maxVal; + cv::reduce(inputs, maxVal, 1, cv::REDUCE_MAX, CV_32F); + cv::Mat expInputs; + cv::exp(inputs - cv::repeat(maxVal, 1, inputs.cols), expInputs); + cv::Mat sumExp; + cv::reduce(expInputs, sumExp, 1, cv::REDUCE_SUM, CV_32F); + cv::Mat softmaxOutput = expInputs / cv::repeat(sumExp, 1, inputs.cols); + return softmaxOutput; +} + ++ (NSDictionary *)calculateResizeRatioAndPaddings:(int)width height:(int)height desiredWidth:(int)desiredWidth desiredHeight:(int)desiredHeight { + const float newRatioH = (float)desiredHeight / height; + const float newRatioW = (float)desiredWidth / width; + float resizeRatio = MIN(newRatioH, newRatioW); + const int newWidth = width * resizeRatio; + const int newHeight = height * resizeRatio; + const int deltaW = desiredWidth - newWidth; + const int deltaH = desiredHeight - newHeight; + const 
+  const int top = deltaH / 2;
+  const int left = deltaW / 2;
+  const float heightRatio = (float)height / desiredHeight;
+  const float widthRatio = (float)width / desiredWidth;
+
+  resizeRatio = MAX(heightRatio, widthRatio);
+
+  return @{
+    @"resizeRatio": @(resizeRatio),
+    @"top": @(top),
+    @"left": @(left),
+  };
+}
+
++ (cv::Mat)getCroppedImage:(NSDictionary *)box image:(cv::Mat)image modelHeight:(int)modelHeight {
+  NSArray *coords = box[@"bbox"];
+  const CGFloat angle = [box[@"angle"] floatValue];
+
+  std::vector<cv::Point2f> points;
+  for (NSValue *value in coords) {
+    const CGPoint point = [value CGPointValue];
+    points.emplace_back(static_cast<float>(point.x), static_cast<float>(point.y));
+  }
+
+  cv::RotatedRect rotatedRect = cv::minAreaRect(points);
+
+  cv::Point2f imageCenter = cv::Point2f(image.cols / 2.0, image.rows / 2.0);
+  cv::Mat rotationMatrix = cv::getRotationMatrix2D(imageCenter, angle, 1.0);
+  cv::Mat rotatedImage;
+  cv::warpAffine(image, rotatedImage, rotationMatrix, image.size(), cv::INTER_LINEAR);
+  cv::Point2f rectPoints[4];
+  rotatedRect.points(rectPoints);
+  std::vector<cv::Point2f> transformedPoints(4);
+  cv::Mat rectMat(4, 2, CV_32FC2, rectPoints);
+  cv::transform(rectMat, rectMat, rotationMatrix);
+
+  for (int i = 0; i < 4; ++i) {
+    transformedPoints[i] = rectPoints[i];
+  }
+
+  cv::Rect boundingBox = cv::boundingRect(transformedPoints);
+  boundingBox &= cv::Rect(0, 0, rotatedImage.cols, rotatedImage.rows);
+  cv::Mat croppedImage = rotatedImage(boundingBox);
+  if (boundingBox.width == 0 || boundingBox.height == 0) {
+    croppedImage = cv::Mat();
+
+    return croppedImage;
+  }
+
+  croppedImage = [self computeRatioAndResize:croppedImage width:boundingBox.width height:boundingBox.height modelHeight:modelHeight];
+
+  return croppedImage;
+}
+
++ (NSMutableArray *)sumProbabilityRows:(cv::Mat)probabilities modelOutputHeight:(int)modelOutputHeight {
+  NSMutableArray *predsNorm = [NSMutableArray arrayWithCapacity:probabilities.rows];
+  for (int i = 0; i < probabilities.rows; i++) {
+    float sum = 0.0;
+    for (int j = 0; j < modelOutputHeight; j++) {
+      sum += probabilities.at<float>(i, j);
+    }
+    [predsNorm addObject:@(sum)];
+  }
+  return predsNorm;
+}
+
++ (NSArray *)findMaxValuesAndIndices:(cv::Mat)probabilities {
+  NSMutableArray *valuesArray = [NSMutableArray array];
+  NSMutableArray *indicesArray = [NSMutableArray array];
+  for (int i = 0; i < probabilities.rows; i++) {
+    double maxVal = 0;
+    cv::Point maxLoc;
+    cv::minMaxLoc(probabilities.row(i), NULL, &maxVal, NULL, &maxLoc);
+    [valuesArray addObject:@(maxVal)];
+    [indicesArray addObject:@(maxLoc.x)];
+  }
+  return @[valuesArray, indicesArray];
+}
+
++ (double)computeConfidenceScore:(NSArray *)valuesArray indicesArray:(NSArray *)indicesArray {
+  NSMutableArray *predsMaxProb = [NSMutableArray array];
+  for (NSUInteger index = 0; index < indicesArray.count; index++) {
+    NSNumber *indicator = indicesArray[index];
+    if ([indicator intValue] != 0) {
+      [predsMaxProb addObject:valuesArray[index]];
+    }
+  }
+  if (predsMaxProb.count == 0) {
+    [predsMaxProb addObject:@(0)];
+  }
+  double product = 1.0;
+  for (NSNumber *prob in predsMaxProb) {
+    product *= [prob doubleValue];
+  }
+  return pow(product, 2.0 / sqrt(predsMaxProb.count));
+}
+
+@end
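
Note for reviewers: the aggregation in computeConfidenceScore above is easier to follow outside Objective-C++. Below is a minimal TypeScript sketch, illustrative only and not part of this diff; it assumes class index 0 is the "blank" prediction that the loop skips.

function computeConfidenceScore(values: number[], indices: number[]): number {
  // Keep only the max probabilities whose predicted class index is non-zero.
  const kept = values.filter((_, i) => indices[i] !== 0);
  const probs = kept.length > 0 ? kept : [0];
  // Product of the kept probabilities, dampened by sequence length.
  const product = probs.reduce((acc, p) => acc * p, 1.0);
  return Math.pow(product, 2 / Math.sqrt(probs.length));
}

// Example: computeConfidenceScore([0.9, 0.99, 0.95], [12, 0, 7])
// keeps 0.9 and 0.95 and returns (0.9 * 0.95) ** (2 / Math.sqrt(2)), roughly 0.80.
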
diff --git a/ios/RnExecutorch/utils/Fetcher.h b/ios/RnExecutorch/utils/Fetcher.h
index 9d75a574..02334bee 100644
--- a/ios/RnExecutorch/utils/Fetcher.h
+++ b/ios/RnExecutorch/utils/Fetcher.h
@@ -4,7 +4,8 @@
 
 enum class ResourceType {
   MODEL,
-  TOKENIZER
+  TOKENIZER,
+  TXT
 };
 
 inline constexpr unsigned int STATUS_OK = 200;
diff --git a/ios/RnExecutorch/utils/Fetcher.mm b/ios/RnExecutorch/utils/Fetcher.mm
index 27ae238e..86cc420a 100644
--- a/ios/RnExecutorch/utils/Fetcher.mm
+++ b/ios/RnExecutorch/utils/Fetcher.mm
@@ -46,6 +46,8 @@ + (BOOL) hasValidExtension:(NSString *)fileName resourceType:(ResourceType)resou
       return [fileName hasSuffix:@".bin"];
     case ResourceType::MODEL:
       return [fileName hasSuffix:@".pte"];
+    case ResourceType::TXT:
+      return [fileName hasSuffix:@".txt"];
     default:
       return NO;
   }
diff --git a/ios/RnExecutorch/utils/ImageProcessor.h b/ios/RnExecutorch/utils/ImageProcessor.h
index 4bb7034e..c65182d0 100644
--- a/ios/RnExecutorch/utils/ImageProcessor.h
+++ b/ios/RnExecutorch/utils/ImageProcessor.h
@@ -3,8 +3,13 @@
 
 @interface ImageProcessor : NSObject
 
++ (NSArray *)matToNSArray:(const cv::Mat &)mat
+                     mean:(cv::Scalar)mean
+                 variance:(cv::Scalar)variance;
 + (NSArray *)matToNSArray:(const cv::Mat &)mat;
 + (cv::Mat)arrayToMat:(NSArray *)array width:(int)width height:(int)height;
++ (cv::Mat)arrayToMatGray:(NSArray *)array width:(int)width height:(int)height;
++ (NSArray *)matToNSArrayGray:(const cv::Mat &)mat;
 + (NSString *)saveToTempFile:(const cv::Mat &)image;
 + (cv::Mat)readImage:(NSString *)source;
 
diff --git a/ios/RnExecutorch/utils/ImageProcessor.mm b/ios/RnExecutorch/utils/ImageProcessor.mm
index feab17f6..a8617c26 100644
--- a/ios/RnExecutorch/utils/ImageProcessor.mm
+++ b/ios/RnExecutorch/utils/ImageProcessor.mm
@@ -4,6 +4,12 @@
 @implementation ImageProcessor
 
 + (NSArray *)matToNSArray:(const cv::Mat &)mat {
+  return [ImageProcessor matToNSArray:mat mean:cv::Scalar(0.0, 0.0, 0.0) variance:cv::Scalar(1.0, 1.0, 1.0)];
+}
+
++ (NSArray *)matToNSArray:(const cv::Mat &)mat
+                     mean:(cv::Scalar)mean
+                 variance:(cv::Scalar)variance {
   int pixelCount = mat.cols * mat.rows;
   NSMutableArray *floatArray = [[NSMutableArray alloc] initWithCapacity:pixelCount * 3];
   for (NSUInteger k = 0; k < pixelCount * 3; k++) {
@@ -14,14 +20,27 @@ + (NSArray *)matToNSArray:(const cv::Mat &)mat {
     int row = i / mat.cols;
     int col = i % mat.cols;
     cv::Vec3b pixel = mat.at<cv::Vec3b>(row, col);
-    floatArray[0 * pixelCount + i] = @(pixel[2] / 255.0f);
-    floatArray[1 * pixelCount + i] = @(pixel[1] / 255.0f);
-    floatArray[2 * pixelCount + i] = @(pixel[0] / 255.0f);
+    floatArray[0 * pixelCount + i] = @((pixel[0] - mean[0] * 255.0) / (variance[0] * 255.0));
+    floatArray[1 * pixelCount + i] = @((pixel[1] - mean[1] * 255.0) / (variance[1] * 255.0));
+    floatArray[2 * pixelCount + i] = @((pixel[2] - mean[2] * 255.0) / (variance[2] * 255.0));
   }
 
   return floatArray;
 }
 
++ (NSArray *)matToNSArrayGray:(const cv::Mat &)mat {
+  NSMutableArray *pixelArray = [[NSMutableArray alloc] initWithCapacity:mat.cols * mat.rows];
+
+  for (int row = 0; row < mat.rows; row++) {
+    for (int col = 0; col < mat.cols; col++) {
+      float pixelValue = mat.at<float>(row, col);
+      [pixelArray addObject:@(pixelValue)];
+    }
+  }
+
+  return pixelArray;
+}
+
 + (cv::Mat)arrayToMat:(NSArray *)array width:(int)width height:(int)height {
   cv::Mat mat(height, width, CV_8UC3);
 
@@ -42,6 +61,20 @@ + (NSArray *)matToNSArray:(const cv::Mat &)mat {
   return mat;
 }
 
++ (cv::Mat)arrayToMatGray:(NSArray *)array width:(int)width height:(int)height {
+  cv::Mat mat(height, width, CV_32F);
+
+  int pixelCount = width * height;
+  for (int i = 0; i < pixelCount; i++) {
+    int row = i / width;
+    int col = i % width;
+    float value = [array[i] floatValue];
+    mat.at<float>(row, col) = value;
+  }
+
+  return mat;
+}
+
 + (NSString *)saveToTempFile:(const cv::Mat&)image {
   NSString *uniqueID = [[NSUUID UUID] UUIDString];
   NSString *filename = [NSString stringWithFormat:@"rn_executorch_%@.png", uniqueID];
@@ -65,9 +98,9 @@ + (NSString *)saveToTempFile:(const cv::Mat&)image {
   //base64
   NSArray *parts = [source componentsSeparatedByString:@","];
   if ([parts count] < 2) {
-    @throw [NSException exceptionWithName:@"readImage_error"
-                        reason:[NSString stringWithFormat:@"%ld", (long)InvalidArgument]
-                        userInfo:nil];
+    @throw [NSException exceptionWithName:@"readImage_error"
+                                    reason:[NSString stringWithFormat:@"%ld", (long)InvalidArgument]
+                                  userInfo:nil];
   }
   NSString *encodedString = parts[1];
   NSData *data = [[NSData alloc] initWithBase64EncodedString:encodedString options:NSDataBase64DecodingIgnoreUnknownCharacters];
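
For context, the new matToSArray-style overload above writes the tensor in channel-planar (CHW) order and normalizes each channel with a mean and variance expressed in the 0..1 range, so the existing matToNSArray: call with mean (0,0,0) and variance (1,1,1) stays a plain divide-by-255. A rough TypeScript equivalent, illustrative only and not part of this diff:

// pixels is the interleaved 8-bit data of a 3-channel image: [c0, c1, c2, c0, c1, c2, ...]
function normalizePlanar(
  pixels: Uint8Array,
  mean: [number, number, number],
  variance: [number, number, number]
): number[] {
  const pixelCount = pixels.length / 3;
  const out = new Array<number>(pixels.length);
  for (let i = 0; i < pixelCount; i++) {
    for (let c = 0; c < 3; c++) {
      // Same formula as the Objective-C++ loop: (value - mean * 255) / (variance * 255).
      out[c * pixelCount + i] = (pixels[3 * i + c] - mean[c] * 255) / (variance[c] * 255);
    }
  }
  return out;
}
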
diff --git a/src/Error.ts b/src/Error.ts
index 76785639..955b62a9 100644
--- a/src/Error.ts
+++ b/src/Error.ts
@@ -4,6 +4,7 @@ export enum ETError {
   ModuleNotLoaded = 0x66,
   FileWriteFailed = 0x67,
   ModelGenerating = 0x68,
+  LanguageNotSupported = 0x69,
   InvalidModelSource = 0xff,
 
   // ExecuTorch mapped errors
diff --git a/src/OCR.ts b/src/OCR.ts
new file mode 100644
index 00000000..17c4aafc
--- /dev/null
+++ b/src/OCR.ts
@@ -0,0 +1,114 @@
+import { useEffect, useState } from 'react';
+import { ResourceSource } from './types/common';
+import { OCR } from './native/RnExecutorchModules';
+import { ETError, getError } from './Error';
+import { Image } from 'react-native';
+import { OCRDetection } from './types/ocr';
+import { symbols } from './constants/ocr/symbols';
+import { languageDicts } from './constants/ocr/languageDicts';
+
+interface OCRModule {
+  error: string | null;
+  isReady: boolean;
+  isGenerating: boolean;
+  forward: (input: string) => Promise<OCRDetection[]>;
+}
+
+const getResourcePath = (source: ResourceSource) => {
+  if (typeof source === 'number') {
+    return Image.resolveAssetSource(source).uri;
+  }
+  return source;
+};
+
+export const useOCR = ({
+  detectorSource,
+  recognizerSources,
+  language = 'en',
+}: {
+  detectorSource: ResourceSource;
+  recognizerSources: {
+    recognizerLarge: ResourceSource;
+    recognizerMedium: ResourceSource;
+    recognizerSmall: ResourceSource;
+  };
+  language?: string;
+}): OCRModule => {
+  const [error, setError] = useState<string | null>(null);
+  const [isReady, setIsReady] = useState(false);
+  const [isGenerating, setIsGenerating] = useState(false);
+
+  useEffect(() => {
+    const loadModel = async () => {
+      if (!detectorSource || Object.keys(recognizerSources).length === 0)
+        return;
+
+      const detectorPath = getResourcePath(detectorSource);
+      const recognizerPaths = {} as {
+        recognizerLarge: string;
+        recognizerMedium: string;
+        recognizerSmall: string;
+      };
+
+      if (!symbols[language] || !languageDicts[language]) {
+        setError(getError(ETError.LanguageNotSupported));
+        return;
+      }
+
+      for (const key in recognizerSources) {
+        if (recognizerSources.hasOwnProperty(key)) {
+          recognizerPaths[key as keyof typeof recognizerPaths] =
+            getResourcePath(
+              recognizerSources[key as keyof typeof recognizerSources]
+            );
+        }
+      }
+
+      const languageDictPath = getResourcePath(languageDicts[language]);
+
+      try {
+        setIsReady(false);
+        await OCR.loadModule(
+          detectorPath,
+          recognizerPaths.recognizerLarge,
+          recognizerPaths.recognizerMedium,
+          recognizerPaths.recognizerSmall,
+          symbols[language],
+          languageDictPath
+        );
+        setIsReady(true);
+      } catch (e) {
+        setError(getError(e));
+      }
+    };
+
+    loadModel();
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [detectorSource, language, JSON.stringify(recognizerSources)]);
+
+  const forward = async (input: string) => {
+    if (!isReady) {
+      throw new Error(getError(ETError.ModuleNotLoaded));
+    }
+    if (isGenerating) {
+      throw new Error(getError(ETError.ModelGenerating));
+    }
+
+    try {
+      setIsGenerating(true);
+      const output = await OCR.forward(input);
+      return output;
+    } catch (e) {
+      throw new Error(getError(e));
+    } finally {
+      setIsGenerating(false);
+    }
+  };
+
+  return {
+    error,
+    isReady,
+    isGenerating,
+    forward,
+  };
+};
diff --git a/src/constants/ocr/languageDicts.ts b/src/constants/ocr/languageDicts.ts
new file mode 100644
index 00000000..fcd189b5
--- /dev/null
+++ b/src/constants/ocr/languageDicts.ts
@@ -0,0 +1,4 @@
+export const languageDicts: { [key: string]: string } = {
+  en: 'https://huggingface.co/nklockiewicz/ocr/resolve/main/en.txt',
+  pl: 'https://huggingface.co/nklockiewicz/ocr/resolve/main/pl.txt',
+};
diff --git a/src/constants/ocr/symbols.ts b/src/constants/ocr/symbols.ts
new file mode 100644
index 00000000..229c0613
--- /dev/null
+++ b/src/constants/ocr/symbols.ts
@@ -0,0 +1,4 @@
+export const symbols: { [key: string]: string } = {
+  en: '0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
+  pl: ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ªÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿĀāĂ㥹ĆćČčĎďĐđĒēĖėĘęĚěĞğĨĩĪīĮįİıĶķĹĺĻļĽľŁłŃńŅņŇňŒœŔŕŘřŚśŞşŠšŤťŨũŪūŮůŲųŸŹźŻżŽžƏƠơƯưȘșȚțə̇ḌḍḶḷṀṁṂṃṄṅṆṇṬṭẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ€',
+};
diff --git a/src/index.tsx b/src/index.tsx
index 74cfd13e..429b5061 100644
--- a/src/index.tsx
+++ b/src/index.tsx
@@ -1,7 +1,9 @@
 export * from './ETModule';
 export * from './LLM';
+export * from './OCR';
 export * from './constants/modelUrls';
 export * from './models/Classification';
 export * from './models/ObjectDetection';
 export * from './models/StyleTransfer';
 export * from './types/object_detection';
+export * from './types/ocr';
diff --git a/src/native/NativeOCR.ts b/src/native/NativeOCR.ts
new file mode 100644
index 00000000..305bf012
--- /dev/null
+++ b/src/native/NativeOCR.ts
@@ -0,0 +1,17 @@
+import type { TurboModule } from 'react-native';
+import { TurboModuleRegistry } from 'react-native';
+import { OCRDetection } from '../types/ocr';
+
+export interface Spec extends TurboModule {
+  loadModule(
+    detectorSource: string,
+    recognizerSourceLarge: string,
+    recognizerSourceMedium: string,
+    recognizerSourceSmall: string,
+    symbols: string,
+    languageDictPath: string
+  ): Promise<number>;
+  forward(input: string): Promise<OCRDetection[]>;
+}
+
+export default TurboModuleRegistry.get<Spec>('OCR');
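
The Spec above can also be driven without the useOCR hook, for example to preload models outside a component. A hypothetical sketch (the paths and symbol set below are placeholders, not values shipped in this PR):

import OCR from './NativeOCR';

async function recognizeOnce(imageUri: string) {
  if (!OCR) {
    // TurboModuleRegistry.get returns null when the native module is not linked.
    throw new Error('OCR native module is not available');
  }
  await OCR.loadModule(
    'file:///models/detector.pte',          // placeholder detector path
    'file:///models/recognizer_large.pte',  // placeholder large recognizer
    'file:///models/recognizer_medium.pte', // placeholder medium recognizer
    'file:///models/recognizer_small.pte',  // placeholder small recognizer
    '0123456789abcdefghijklmnopqrstuvwxyz', // placeholder symbol set
    'file:///models/dictionary.txt'         // placeholder language dictionary
  );
  return OCR.forward(imageUri); // resolves to OCRDetection[]
}
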
diff --git a/src/native/RnExecutorchModules.ts b/src/native/RnExecutorchModules.ts
index 8a80b595..925ec09b 100644
--- a/src/native/RnExecutorchModules.ts
+++ b/src/native/RnExecutorchModules.ts
@@ -71,6 +71,19 @@ const StyleTransfer = StyleTransferSpec
       }
     );
 
+const OCRSpec = require('./NativeOCR').default;
+
+const OCR = OCRSpec
+  ? OCRSpec
+  : new Proxy(
+      {},
+      {
+        get() {
+          throw new Error(LINKING_ERROR);
+        },
+      }
+    );
+
 class _ObjectDetectionModule {
   async forward(input: string) {
     return await ObjectDetection.forward(input);
@@ -120,6 +133,7 @@ export {
   Classification,
   ObjectDetection,
   StyleTransfer,
+  OCR,
   _ETModule,
   _ClassificationModule,
   _StyleTransferModule,
diff --git a/src/types/ocr.ts b/src/types/ocr.ts
new file mode 100644
index 00000000..f5f2e6d3
--- /dev/null
+++ b/src/types/ocr.ts
@@ -0,0 +1,10 @@
+export interface OCRDetection {
+  bbox: OCRBbox[];
+  text: string;
+  score: number;
+}
+
+export interface OCRBbox {
+  x: number;
+  y: number;
+}
diff --git a/src/useModule.ts b/src/useModule.ts
index 66c2fd49..45e58c7b 100644
--- a/src/useModule.ts
+++ b/src/useModule.ts
@@ -1,7 +1,7 @@
 import { useEffect, useState } from 'react';
 import { Image } from 'react-native';
 import { ETError, getError } from './Error';
-import { ETInput, module } from './types/common';
+import { ETInput, module, ResourceSource } from './types/common';
 
 const getTypeIdentifier = (arr: ETInput): number => {
   if (arr instanceof Int8Array) return 0;
@@ -14,7 +14,7 @@ const getTypeIdentifier = (arr: ETInput): number => {
 };
 
 interface Props {
-  modelSource: string | number;
+  modelSource: ResourceSource;
   module: module;
 }
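
End to end, forward resolves to the OCRDetection[] shape defined in src/types/ocr.ts above. A small consumer-side sketch, not part of the diff; the 0.3 score threshold is an arbitrary example value:

import { OCRDetection, OCRBbox } from 'react-native-executorch';

// Average a detection's bbox corner points to get its center.
const bboxCenter = (bbox: OCRBbox[]): OCRBbox => ({
  x: bbox.reduce((acc, p) => acc + p.x, 0) / bbox.length,
  y: bbox.reduce((acc, p) => acc + p.y, 0) / bbox.length,
});

// Keep reasonably confident detections and pair each text fragment with its center.
export const summarizeDetections = (detections: OCRDetection[]) =>
  detections
    .filter((d) => d.score >= 0.3)
    .map((d) => ({ text: d.text, center: bboxCenter(d.bbox) }));
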