diff --git a/models/text_detection_db/CMakeLists.txt b/models/text_detection_db/CMakeLists.txt
new file mode 100644
index 00000000..1a0ef059
--- /dev/null
+++ b/models/text_detection_db/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.24)
+set(project_name "opencv_zoo_text_detection_db")
+
+PROJECT (${project_name})
+
+set(OPENCV_VERSION "4.7.0")
+set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
+find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
+# Find OpenCV, you may need to set OpenCV_DIR variable
+# to the absolute path to the directory containing OpenCVConfig.cmake file
+# via the command line or GUI
+
+file(GLOB SourceFile
+    "demo.cpp")
+# If the package has been found, several variables will
+# be set, you can find the full list with descriptions
+# in the OpenCVConfig.cmake file.
+# Print some message showing some of them
+message(STATUS "OpenCV library status:")
+message(STATUS " config: ${OpenCV_DIR}")
+message(STATUS " version: ${OpenCV_VERSION}")
+message(STATUS " libraries: ${OpenCV_LIBS}")
+message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")
+
+# Declare the executable target built from your sources
+add_executable(${project_name} ${SourceFile})
+
+# Link your application with OpenCV libraries
+target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
diff --git a/models/text_detection_db/README.md b/models/text_detection_db/README.md
index 52a5ec68..89d505b9 100644
--- a/models/text_detection_db/README.md
+++ b/models/text_detection_db/README.md
@@ -11,6 +11,8 @@ Note:
 
 ## Demo
 
+### Python
+
 Run the following command to try the demo:
 
 ```shell
@@ -23,6 +25,22 @@ python demo.py --input /path/to/image -v
 python demo.py --help
 ```
 
+### C++
+
+Install latest OpenCV and CMake >= 3.24.0 to get started with:
+
+```shell
+# A typical and default installation path of OpenCV is /usr/local
+cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
+cmake --build build
+# detect on camera input
+./build/opencv_zoo_text_detection_db -m=/path/to/model
+# detect on an image
+./build/opencv_zoo_text_detection_db -m=/path/to/model -i=/path/to/image -v
+# get help messages
+./build/opencv_zoo_text_detection_db -h
+```
+
 ### Example outputs
 
 ![mask](./example_outputs/mask.jpg)
diff --git a/models/text_detection_db/demo.cpp b/models/text_detection_db/demo.cpp
new file mode 100644
--- /dev/null
+++ b/models/text_detection_db/demo.cpp
@@ -0,0 +1,202 @@
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+using namespace std;
+using namespace cv;
+using namespace dnn;
+
+// Backend/target pairs selectable through the index passed as --backend.
+vector< pair<dnn::Backend, dnn::Target> > backendTargetPairs = {
+        std::make_pair(dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU),
+        std::make_pair(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA),
+        std::make_pair(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16),
+        std::make_pair(dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU),
+        std::make_pair(dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU)};
+
+
+// Option table consumed by cv::CommandLineParser in main().
+std::string keys =
+"{ help h | | Print help message. }"
+"{ model m | text_detection_DB_IC15_resnet18_2021sep.onnx | Usage: Set model type, defaults to text_detection_DB_IC15_resnet18_2021sep.onnx }"
+"{ input i | | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
+"{ width | 736 | Usage: Resize input image to certain width, default = 736. It should be multiple by 32.}"
+"{ height | 736 | Usage: Resize input image to certain height, default = 736. It should be multiple by 32.}"
+"{ binary_threshold | 0.3 | Usage: Threshold of the binary map, default = 0.3.}"
+"{ polygon_threshold | 0.5 | Usage: Threshold of polygons, default = 0.5.}"
+"{ max_candidates | 200 | Usage: Set maximum number of polygon candidates, default = 200.}"
+"{ unclip_ratio | 2.0 | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
+"{ save s | true | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
+"{ viz v | true | Usage: Specify to open a new window to show results. Invalid in case of camera input.}"
+"{ backend bt | 0 | Choose one of computation backends: "
+"0: (default) OpenCV implementation + CPU, "
+"1: CUDA + GPU (CUDA), "
+"2: CUDA + GPU (CUDA FP16), "
+"3: TIM-VX + NPU, "
+"4: CANN + NPU}";
+
+
+// Wraps cv::dnn::TextDetectionModel_DB together with the options the demo configures it with.
+class DB {
+public:
+
+    // Reads the network from modPath, then applies backend, target, thresholds and input preprocessing.
+    DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3f,
+       float polyThresh = 0.5f, int maxCand = 200, double unRatio = 2.0,
+       dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), model(readNet(modelPath)), inputSize(inSize), binaryThreshold(binThresh),
+       polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
+       backendId(bId), targetId(tId)
+    {
+        this->model.setPreferableBackend(backendId);
+        this->model.setPreferableTarget(targetId);
+
+        this->model.setBinaryThreshold(binaryThreshold);
+        this->model.setPolygonThreshold(polygonThreshold);
+        this->model.setUnclipRatio(unclipRatio);
+        this->model.setMaxCandidates(maxCandidates);
+
+        this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
+    }
+
+    // Runs detection on an image whose size must equal the configured input size.
+    // Returns the detected polygons (first) and the confidences filled in by detect() (second).
+    pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
+        CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size ");
+        CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size ");
+        vector<vector<Point>> pt;
+        vector<float> confidence;
+        this->model.detect(image, pt, confidence);
+        // Hand the buffers over to the result instead of copying them.
+        return make_pair(std::move(pt), std::move(confidence));
+    }
+
+private:
+    string modelPath;
+    TextDetectionModel_DB model;
+    Size inputSize;
+    float binaryThreshold;
+    float polygonThreshold;
+    int maxCandidates;
+    double unclipRatio;
+    dnn::Backend backendId;
+    dnn::Target targetId;
+
+};
+
+// Returns a copy of image with the polygons in results.first drawn on it and,
+// when fps is positive, an "FPS: x" label near the top-left corner.
+Mat visualize(Mat image, const pair< vector<vector<Point>>, vector<float> >&results, double fps=-1, Scalar boxColor=Scalar(0, 255, 0), Scalar textColor=Scalar(0, 0, 255), bool isClosed=true, int thickness=2)
+{
+    Mat output;
+    image.copyTo(output);
+    if (fps > 0)
+        putText(output, cv::format("FPS: %.2f", fps), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
+    polylines(output, results.first, isClosed, boxColor, thickness);
+    return output;
+}
+
+// Parses the options, builds the detector and runs it on every frame of the image/video/camera input.
+int main(int argc, char** argv)
+{
+    CommandLineParser parser(argc, argv, keys);
+
+    parser.about("Use this program to run Real-time Scene Text Detection with Differentiable Binarization in opencv Zoo using OpenCV.");
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    int backendTargetid = parser.get<int>("backend");
+    String modelName = parser.get<String>("model");
+
+    if (modelName.empty())
+    {
+        CV_Error(Error::StsError, "Model file " + modelName + " not found");
+    }
+
+    Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
+    float binThresh = parser.get<float>("binary_threshold");
+    float polyThresh = parser.get<float>("polygon_threshold");
+    int maxCand = parser.get<int>("max_candidates");
+    double unRatio = parser.get<double>("unclip_ratio");
+    bool save = parser.get<bool>("save");
+    bool viz = parser.get<bool>("viz");
+    const bool hasInput = parser.has("input");
+
+    // Report option values that failed to convert instead of continuing with fallback values.
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 1;
+    }
+
+    // The backend index comes from the command line, so validate it before indexing the table.
+    if (backendTargetid < 0 || backendTargetid >= static_cast<int>(backendTargetPairs.size()))
+    {
+        CV_Error(Error::StsBadArg, cv::format("Invalid backend index %d, expected a value in [0, %d]", backendTargetid, static_cast<int>(backendTargetPairs.size()) - 1));
+    }
+
+    DB model(modelName, inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
+
+    //! [Open a video file or an image file or a camera stream]
+    VideoCapture cap;
+    if (hasInput)
+        cap.open(parser.get<String>("input"));
+    else
+        cap.open(0);
+    if (!cap.isOpened())
+        CV_Error(Error::StsError, "Cannot open video or file");
+    Mat originalImage;
+    const std::string kWinName = modelName;
+    while (waitKey(1) < 0)
+    {
+        cap >> originalImage;
+        if (originalImage.empty())
+        {
+            cout << "Frame is empty" << endl;
+            waitKey();
+            break;
+        }
+        int originalW = originalImage.cols;
+        int originalH = originalImage.rows;
+        double scaleHeight = originalH / static_cast<double>(inpSize.height);
+        double scaleWidth = originalW / static_cast<double>(inpSize.width);
+        Mat image;
+        resize(originalImage, image, inpSize);
+
+        // inference
+        TickMeter tm;
+        tm.start();
+        pair< vector<vector<Point>>, vector<float> > results = model.infer(image);
+        tm.stop();
+        // Scale the detected polygons from network input size back to the original frame size
+        for (auto &pts : results.first)
+        {
+            for (auto &pt : pts)
+            {
+                pt.x = static_cast<int>(pt.x * scaleWidth);
+                pt.y = static_cast<int>(pt.y * scaleHeight);
+            }
+        }
+        originalImage = visualize(originalImage, results, tm.getFPS());
+        if (hasInput)
+        {
+            if (save)
+            {
+                cout << "Result image saved to result.jpg\n";
+                imwrite("result.jpg", originalImage);
+            }
+            if (viz)
+                imshow(kWinName, originalImage);
+        }
+        else
+            imshow(kWinName, originalImage);
+    }
+    return 0;
+}