Overview

I recently needed to determine, given multiple cropped sub-images, where each one is located (its coordinates) within the larger original image. This article is a memo summarizing the method for doing this.

I introduce a method using OpenCV’s SIFT (Scale-Invariant Feature Transform) to perform feature point matching between template images and the original image, estimate the affine transformation, and obtain the coordinates.

Implementation

Required Libraries

pip install opencv-python numpy tqdm

Python Code

The following code matches template images (PNG images in templates_dir) against a specified large image (image_path) using SIFT, and obtains the coordinates within the original image.

import cv2
import numpy as np
from glob import glob
from tqdm import tqdm
import os

# Read an image from disk as a single-channel grayscale array.
def load_image_gray(path):
    """Return the grayscale image at *path*, or None when it cannot be read.

    cv2.imread does not raise on failure; it returns None, so the caller
    must check the result before using it.
    """
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        print(f"Image not found: {path}")
    return gray

# Run the detector over the whole image (no mask) and return its output.
def extract_features(image, detector):
    """Detect keypoints and compute descriptors for *image*.

    Returns the (keypoints, descriptors) pair produced by
    detector.detectAndCompute; descriptors may be None when no
    features are found.
    """
    keypoints_and_descriptors = detector.detectAndCompute(image, None)
    return keypoints_and_descriptors

# Matching process
def match_features(des1, des2, matcher, ratio_test=0.7, min_matches=4):
    """Match descriptor sets and keep only confident matches.

    Applies Lowe's ratio test: a match survives when its best distance is
    clearly smaller than the second-best (distance < ratio_test * second).

    Returns the list of good matches, or None when fewer than
    *min_matches* survive (too weak to estimate a transform from).
    """
    matches = matcher.knnMatch(des1, des2, k=2)
    # knnMatch may return pairs with fewer than 2 neighbours when the
    # descriptor set is small; guard the unpack so those don't crash
    # the ratio test.
    good_matches = [
        pair[0]
        for pair in matches
        if len(pair) == 2 and pair[0].distance < ratio_test * pair[1].distance
    ]
    return good_matches if len(good_matches) >= min_matches else None

# Affine transformation estimation
def estimate_affine_transform(kp1, kp2, good_matches):
    """Estimate the partial affine transform mapping template points to image points.

    kp1 are keypoints of the target image (queryIdx side), kp2 of the
    template (trainIdx side). Uses RANSAC to reject outlier matches.
    Returns a 2x3 matrix, or None when estimation fails.
    """
    template_pts = np.float32(
        [kp2[match.trainIdx].pt for match in good_matches]
    ).reshape(-1, 1, 2)
    image_pts = np.float32(
        [kp1[match.queryIdx].pt for match in good_matches]
    ).reshape(-1, 1, 2)
    matrix, _inlier_mask = cv2.estimateAffinePartial2D(
        template_pts, image_pts, method=cv2.RANSAC, ransacReprojThreshold=5.0
    )
    return matrix

# Draw matched rectangle on image
def draw_matched_rectangle(image, M_affine, templ_shape):
    """Project the template's bounding rectangle into *image* and outline it.

    templ_shape is the (height, width) of the template. The four corners
    are mapped through the 2x3 affine matrix, drawn as a closed red
    polygon on *image* (mutated in place), and returned as float points.
    """
    height, width = templ_shape
    corners = np.float32([[0, 0], [width, 0], [width, height], [0, height]])
    projected = cv2.transform(np.array([corners]), M_affine)[0]
    cv2.polylines(
        image, [np.int32(projected)], isClosed=True, color=(0, 0, 255), thickness=2
    )
    return projected

# Main processing
def main(image_path, templates_dir, output_path):
    """Locate every template image inside the target image and save an annotated copy.

    image_path: path of the large target image.
    templates_dir: glob pattern for the template images (e.g. "/dir/*.png").
    output_path: where the annotated result image is written.
    """
    # Load image and template list
    img = load_image_gray(image_path)
    if img is None:
        # load_image_gray already printed the error; nothing to match against.
        return
    templ_paths = glob(templates_dir)
    if not templ_paths:
        print(f"No template images matched: {templates_dir}")
        return
    dst_img = cv2.imread(image_path)
    if dst_img is None:
        # Grayscale load succeeded but the color reload failed; bail out
        # rather than crash when drawing.
        print(f"Image not found: {image_path}")
        return

    # SIFT feature detector & BFMatcher setup
    # crossCheck must stay off for knnMatch + ratio test to work.
    sift = cv2.SIFT_create()
    bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=False)
    kp1, des1 = extract_features(img, sift)

    # If no features found
    if des1 is None:
        print("No features found in the target image.")
        return

    for templ_path in tqdm(templ_paths):
        templ = load_image_gray(templ_path)
        if templ is None:
            continue

        kp2, des2 = extract_features(templ, sift)
        if des2 is None:
            continue

        good_matches = match_features(des1, des2, bf)
        if good_matches is None:
            print(f"Insufficient feature matches: {templ_path}")
            continue

        # Affine transformation estimation
        M_affine = estimate_affine_transform(kp1, kp2, good_matches)
        if M_affine is None:
            print(f"Affine transformation estimation failed: {templ_path}")
            continue

        # Draw rectangle
        best_dst = draw_matched_rectangle(dst_img, M_affine, templ.shape)

        # Display filename near the rectangle
        x, y, _, _ = cv2.boundingRect(best_dst)
        base_name = os.path.splitext(os.path.basename(templ_path))[0]
        cv2.putText(dst_img, base_name, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Save result
    cv2.imwrite(output_path, dst_img)
    print(f"Result image saved: {output_path}")

Execution

# Execution
if __name__ == "__main__":
    # Parameter settings — replace the /xxx/ placeholders with real paths.
    IMAGE_PATH = "/xxx/default.jpg"      # large target image to search in
    TEMPLATES_DIR = "/xxx/*.png"         # glob pattern for template images
    OUTPUT_PATH = "/xxx/match_result.jpg"  # annotated result written here
    main(IMAGE_PATH, TEMPLATES_DIR, OUTPUT_PATH)

Summary

This article introduced a method for estimating where sub-images are located in the original image using SIFT-based feature point matching, and identifying positions through affine transformation.

  • SIFT is used for feature extraction (freely available since OpenCV 4.4)
  • BFMatcher is used for feature matching, and RANSAC is used for noise removal
  • Affine transformation is used to estimate coordinates and draw rectangles on the original image
  • Result images are saved for visualization of where each sub-image is located

This method can be applied to tasks such as locating partial images in historical maps, OCR region detection, and image comparison.

Future challenges:

  • Correction for rotated images
  • Consideration of faster algorithms than SIFT (ORB, AKAZE, etc.)
  • Processing speed optimization (feature point filtering)

There may be some incomplete points, but I hope this serves as a useful reference.