Module data_handling.utils

Utility functions for data handling like: 1. Drawing annotations 2. Getting stats. 3. Removing categories or creating subset of dataset of particular categories. 4. Merging datasets.

Expand source code
"""Utility functions for data handling like:
1. Drawing annotations
2. Getting stats.
3. Removing categories or creating subset of dataset of particular categories.
4. Merging datasets."""

from pycocotools.coco import COCO
import json
import os
import cv2
from tqdm import tqdm
from .coco_assistant import coco_assistant
import random
import shutil
import colorsys
from pathlib import Path

def __generate_colors(num_classes):
    """Generate one distinct colour per class in a reproducible shuffled order.

    Hues are spaced evenly around the HSV wheel at full saturation and value,
    converted with ``colorsys.hsv_to_rgb`` and scaled to 0-255 integers. The
    list is then shuffled with a fixed seed so that neighbouring class indices
    do not get neighbouring hues, while every run yields the same colours.

    Args:
        num_classes (int): Number of colours to generate.

    Returns:
        list: ``num_classes`` tuples of three ints, each in the range 0-255.
    """
    colors = []
    for index in range(num_classes):
        first, second, third = colorsys.hsv_to_rgb(index / num_classes, 1., 1.)
        colors.append((int(first * 255), int(second * 255), int(third * 255)))
    # A private generator seeded with 10101 produces the same shuffle as
    # seeding the module-level generator, but leaves the caller's global
    # random state (and any seed they set) untouched.
    random.Random(10101).shuffle(colors)
    return colors

def draw_annotations(image_dir, annotation_file, output_dir):
    """Draw annotations on images with coco annotation file

    Every image listed in the annotation file is read from ``image_dir``, each
    of its bounding boxes is drawn together with the category name, and the
    result is written under ``output_dir`` with the same file name.

    Args:
        image_dir (str): Path to input image directory
        annotation_file (str): Path to annotation file
        output_dir (str): Path to output image directory

    Raises:
        OSError: If an image listed in the annotation file cannot be read.
    """
    font_scale = 1
    thickness = 2
    # NOTE(review): the font face and the text-size arguments were lost from
    # the published listing; FONT_HERSHEY_SIMPLEX with the same scale and
    # thickness as the label is assumed here - confirm against the repository.
    font_face = cv2.FONT_HERSHEY_SIMPLEX

    os.makedirs(output_dir, exist_ok=True)

    coco = COCO(annotation_file)
    cat_ids = coco.getCatIds()
    category_dict = {category['id']: category['name'] for category in coco.loadCats(cat_ids)}

    # Colours are indexed by category id, so the list must reach the largest id.
    colors_list = __generate_colors(max(cat_ids, default=-1) + 1)

    for img_id in tqdm(coco.getImgIds(), desc='Drawing on Images'):
        img = coco.loadImgs([img_id])[0]
        image_path = os.path.join(image_dir, img['file_name'])
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise OSError("Could not read image: %s" % image_path)
        ann_ids = coco.getAnnIds(imgIds=[img_id], catIds=cat_ids)
        for annotation in coco.loadAnns(ann_ids):
            # Unpack into locals instead of rewriting annotation['bbox'] in
            # place, which would corrupt the box held by the COCO object.
            x, y, width, height = annotation['bbox'][:4]
            x0 = int(x)
            y0 = int(y)
            x1 = int(x + width)
            y1 = int(y + height)
            color = colors_list[annotation['category_id']]
            label = category_dict[annotation['category_id']]
            cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness)
            (_, _), baseline = cv2.getTextSize(label, font_face, font_scale, thickness)
            cv2.putText(image, label, (x0, y0 - baseline),
                        font_face, font_scale, color, thickness)
        output_path = os.path.join(output_dir, img['file_name'])
        # file_name may contain sub-directories; cv2.imwrite does not create them.
        output_parent = os.path.dirname(output_path)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)
        cv2.imwrite(output_path, image)

def annotation_stats(annotation_file):
    """Print stats like number of instances and number of images in that category

    One header row is printed, followed by one row per category id.

    Args:
        annotation_file (str): coco json annotation file path
    """
    coco = COCO(annotation_file)
    print("\n%-30s %-5s      %-15s %s" % ("Category", "ID", "Instances", "Image Count"))
    for cat_id in coco.getCatIds():
        instance_count = len(coco.getAnnIds(catIds=[cat_id]))
        image_count = len(coco.getImgIds(catIds=[cat_id]))
        print("%-30s %-5d ---> %-15d %d" %
              (coco.cats[cat_id]['name'], cat_id, instance_count, image_count))

def remove_categories(annotation_file, categories, output_annotation_file, keep_blank_images=True):
    """Remove the categories from the annotation file

    Categories whose name is in ``categories`` are dropped together with all
    of their annotations, and the result is written as JSON.

    Args:
        annotation_file (str): path to annotation file
        categories (list): list of categories to remove
        output_annotation_file (str): path to output annotation file
        keep_blank_images (bool): keep images with no annotations. When False,
            images that lose all of their annotations through this removal are
            dropped; images that had no annotations to begin with are kept.
    """
    with open(annotation_file) as f:
        data = json.load(f)

    removed_category_ids = {category['id'] for category in data['categories']
                            if category['name'] in categories}
    data['categories'] = [category for category in data['categories']
                          if category['name'] not in categories]

    removed_image_ids = set()  # images carrying at least one removed annotation
    kept_image_ids = set()     # images carrying at least one surviving annotation
    annotation_list = []
    for annotation in data['annotations']:
        if annotation['category_id'] in removed_category_ids:
            removed_image_ids.add(annotation['image_id'])
        else:
            kept_image_ids.add(annotation['image_id'])
            annotation_list.append(annotation)

    if not keep_blank_images:
        # Only images that became blank because of this removal are dropped.
        image_ids_remove = removed_image_ids - kept_image_ids
        data['images'] = [image for image in data['images'] if image['id'] not in image_ids_remove]
    data['annotations'] = annotation_list

    with open(output_annotation_file, 'w') as f:
        json.dump(data, f)

def category_subset(image_folder, annotation_file, input_cat, output_folder="categories", max_img_per_label=-1):
    """Subset the annotation file to only include the categories specified

    The selected images are copied to ``<output_folder>/images`` and the
    reduced annotation file is written to
    ``<output_folder>/annotations/annotations.json``.

    Args:
        image_folder (str): Path to image folder
        annotation_file (str): Path to annotation file
        input_cat (list): List of categories to include
        output_folder (str): Path to output folder. Defaults to "categories"
        max_img_per_label (int): Maximum number of images per label. Defaults to -1, then all images are included.
    """
    resim_dir = Path(output_folder) / "images"
    resann_dir = Path(output_folder) / "annotations"
    dst_ann = resann_dir / "annotations.json"
    resim_dir.mkdir(parents=True, exist_ok=True)
    resann_dir.mkdir(parents=True, exist_ok=True)

    coco = COCO(annotation_file)
    cat_Ids = [category['id'] for category in coco.loadCats(coco.getCatIds())
               if category['name'] in input_cat]
    category_list = coco.loadCats(ids=cat_Ids)

    img_id_list = []
    seen_img_ids = set()
    for cat_id in tqdm(cat_Ids, desc='Category data selection'):
        img_ids = coco.getImgIds(catIds=[cat_id])
        if max_img_per_label > 0 and len(img_ids) > max_img_per_label:
            # Sample positions, then sort them so the kept ids stay in their original order.
            picked = sorted(random.sample(range(len(img_ids)), max_img_per_label))
            img_ids = [img_ids[i] for i in picked]
        # An image can belong to several requested categories; keep it once.
        for img_id in img_ids:
            if img_id not in seen_img_ids:
                seen_img_ids.add(img_id)
                img_id_list.append(img_id)

    images = coco.loadImgs(ids=img_id_list)
    if img_id_list:
        ann_ids = coco.getAnnIds(catIds=cat_Ids, imgIds=img_id_list)
        annotation_list = coco.loadAnns(ann_ids)
    else:
        # getAnnIds treats empty filter lists as "no filter" and would return
        # every annotation in the dataset when nothing was selected.
        annotation_list = []

    for image in tqdm(images, desc='Copying images'):
        source = os.path.join(image_folder, image['file_name'])
        destination = resim_dir / image['file_name']
        # file_name may contain sub-directories that do not exist yet.
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(source, destination)

    # Reload the raw JSON so every other top-level key is carried over unchanged.
    with open(annotation_file) as infile:
        coco_data = json.load(infile)
    coco_data['categories'] = category_list
    coco_data['annotations'] = annotation_list
    coco_data['images'] = images
    with open(dst_ann, 'w') as outfile:
        json.dump(coco_data, outfile)

def merge_datasets(image_folder, annotation_folder, output_folder="merged", merge_images=True, duplicate_frames=True):
    """Merge multiple datasets into a single dataset

    Thin wrapper that builds a ``COCO_Assistant`` for the two folders and
    calls its ``merge`` method with the remaining arguments.

    Args:
        image_folder (str): Path to image folder
        annotation_folder (str): Path to annotation folder
        output_folder (str): Path to output folder. Defaults to "merged"
        merge_images (bool): Merge images. Defaults to True
        duplicate_frames (bool): Duplicate frames. Defaults to True
    """
    coco_obj = coco_assistant.COCO_Assistant(image_folder, annotation_folder)
    coco_obj.merge(output_folder, merge_images, duplicate_frames)

def merge_annotations(image_folder, annotation_folder, output_folder="merged", merge_images=True):
    """Merge multiple annotations of single image into one file.

    Thin wrapper that builds a ``COCO_Assistant`` for the two folders and
    calls its ``merge_same`` method with the remaining arguments.

    Args:
        image_folder (str): Path to image folder
        annotation_folder (str): Path to annotation folder
        output_folder (str): Path to output folder. Defaults to "merged"
        merge_images (bool): Merge images. Defaults to True
    """
    coco_obj = coco_assistant.COCO_Assistant(image_folder, annotation_folder)
    coco_obj.merge_same(output_folder, merge_images)

if __name__ == "__main__":
    # Running the module directly currently does nothing: the only call is
    # the disabled example below, which would take the image directory,
    # annotation file and output directory from the command line.
    import sys
    # draw_annotations(sys.argv[1], sys.argv[2], sys.argv[3])


def annotation_stats(annotation_file)

Print stats like number of instances and number of images in that category


annotation_file : str
coco json annotation file path
Expand source code
def annotation_stats(annotation_file):
    """Print stats like number of instances and number of images in that category

    One header row is printed, followed by one row per category id.

    Args:
        annotation_file (str): coco json annotation file path
    """
    coco = COCO(annotation_file)
    print("\n%-30s %-5s      %-15s %s" % ("Category", "ID", "Instances", "Image Count"))
    for cat_id in coco.getCatIds():
        instance_count = len(coco.getAnnIds(catIds=[cat_id]))
        image_count = len(coco.getImgIds(catIds=[cat_id]))
        print("%-30s %-5d ---> %-15d %d" %
              (coco.cats[cat_id]['name'], cat_id, instance_count, image_count))
def category_subset(image_folder, annotation_file, input_cat, output_folder='categories', max_img_per_label=-1)

Subset the annotation file to only include the categories specified


image_folder : str
Path to image folder
annotation_file : str
Path to annotation file
input_cat : list
List of categories to include
output_folder : str
Path to output folder. Defaults to "categories"
max_img_per_label : int
Maximum number of images per label. Defaults to -1, then all images are included.
Expand source code
def category_subset(image_folder, annotation_file, input_cat, output_folder="categories", max_img_per_label=-1):
    """Subset the annotation file to only include the categories specified

    The selected images are copied to ``<output_folder>/images`` and the
    reduced annotation file is written to
    ``<output_folder>/annotations/annotations.json``.

    Args:
        image_folder (str): Path to image folder
        annotation_file (str): Path to annotation file
        input_cat (list): List of categories to include
        output_folder (str): Path to output folder. Defaults to "categories"
        max_img_per_label (int): Maximum number of images per label. Defaults to -1, then all images are included.
    """
    resim_dir = Path(output_folder) / "images"
    resann_dir = Path(output_folder) / "annotations"
    dst_ann = resann_dir / "annotations.json"
    resim_dir.mkdir(parents=True, exist_ok=True)
    resann_dir.mkdir(parents=True, exist_ok=True)

    coco = COCO(annotation_file)
    cat_Ids = [category['id'] for category in coco.loadCats(coco.getCatIds())
               if category['name'] in input_cat]
    category_list = coco.loadCats(ids=cat_Ids)

    img_id_list = []
    seen_img_ids = set()
    for cat_id in tqdm(cat_Ids, desc='Category data selection'):
        img_ids = coco.getImgIds(catIds=[cat_id])
        if max_img_per_label > 0 and len(img_ids) > max_img_per_label:
            # Sample positions, then sort them so the kept ids stay in their original order.
            picked = sorted(random.sample(range(len(img_ids)), max_img_per_label))
            img_ids = [img_ids[i] for i in picked]
        # An image can belong to several requested categories; keep it once.
        for img_id in img_ids:
            if img_id not in seen_img_ids:
                seen_img_ids.add(img_id)
                img_id_list.append(img_id)

    images = coco.loadImgs(ids=img_id_list)
    if img_id_list:
        ann_ids = coco.getAnnIds(catIds=cat_Ids, imgIds=img_id_list)
        annotation_list = coco.loadAnns(ann_ids)
    else:
        # getAnnIds treats empty filter lists as "no filter" and would return
        # every annotation in the dataset when nothing was selected.
        annotation_list = []

    for image in tqdm(images, desc='Copying images'):
        source = os.path.join(image_folder, image['file_name'])
        destination = resim_dir / image['file_name']
        # file_name may contain sub-directories that do not exist yet.
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(source, destination)

    # Reload the raw JSON so every other top-level key is carried over unchanged.
    with open(annotation_file) as infile:
        coco_data = json.load(infile)
    coco_data['categories'] = category_list
    coco_data['annotations'] = annotation_list
    coco_data['images'] = images
    with open(dst_ann, 'w') as outfile:
        json.dump(coco_data, outfile)
def draw_annotations(image_dir, annotation_file, output_dir)

Draw annotations on images with coco annotation file


image_dir : str
Path to input image directory
annotation_file : str
Path to annotation file
output_dir : str
Path to output image directory
Expand source code
def draw_annotations(image_dir, annotation_file, output_dir):
    """Draw annotations on images with coco annotation file

    Every image listed in the annotation file is read from ``image_dir``, each
    of its bounding boxes is drawn together with the category name, and the
    result is written under ``output_dir`` with the same file name.

    Args:
        image_dir (str): Path to input image directory
        annotation_file (str): Path to annotation file
        output_dir (str): Path to output image directory

    Raises:
        OSError: If an image listed in the annotation file cannot be read.
    """
    font_scale = 1
    thickness = 2
    # NOTE(review): the font face and the text-size arguments were lost from
    # the published listing; FONT_HERSHEY_SIMPLEX with the same scale and
    # thickness as the label is assumed here - confirm against the repository.
    font_face = cv2.FONT_HERSHEY_SIMPLEX

    os.makedirs(output_dir, exist_ok=True)

    coco = COCO(annotation_file)
    cat_ids = coco.getCatIds()
    category_dict = {category['id']: category['name'] for category in coco.loadCats(cat_ids)}

    # Colours are indexed by category id, so the list must reach the largest id.
    colors_list = __generate_colors(max(cat_ids, default=-1) + 1)

    for img_id in tqdm(coco.getImgIds(), desc='Drawing on Images'):
        img = coco.loadImgs([img_id])[0]
        image_path = os.path.join(image_dir, img['file_name'])
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise OSError("Could not read image: %s" % image_path)
        ann_ids = coco.getAnnIds(imgIds=[img_id], catIds=cat_ids)
        for annotation in coco.loadAnns(ann_ids):
            # Unpack into locals instead of rewriting annotation['bbox'] in
            # place, which would corrupt the box held by the COCO object.
            x, y, width, height = annotation['bbox'][:4]
            x0 = int(x)
            y0 = int(y)
            x1 = int(x + width)
            y1 = int(y + height)
            color = colors_list[annotation['category_id']]
            label = category_dict[annotation['category_id']]
            cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness)
            (_, _), baseline = cv2.getTextSize(label, font_face, font_scale, thickness)
            cv2.putText(image, label, (x0, y0 - baseline),
                        font_face, font_scale, color, thickness)
        output_path = os.path.join(output_dir, img['file_name'])
        # file_name may contain sub-directories; cv2.imwrite does not create them.
        output_parent = os.path.dirname(output_path)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)
        cv2.imwrite(output_path, image)
def merge_annotations(image_folder, annotation_folder, output_folder='merged', merge_images=True)

Merge multiple annotations of single image into one file.


image_folder : str
Path to image folder
annotation_folder : str
Path to annotation folder
output_folder : str
Path to output folder. Defaults to "merged"
merge_images : bool
Merge images. Defaults to True
Expand source code
def merge_annotations(image_folder, annotation_folder, output_folder="merged", merge_images=True):
    """Merge multiple annotations of single image into one file.

    Thin wrapper that builds a ``COCO_Assistant`` for the two folders and
    calls its ``merge_same`` method with the remaining arguments.

    Args:
        image_folder (str): Path to image folder
        annotation_folder (str): Path to annotation folder
        output_folder (str): Path to output folder. Defaults to "merged"
        merge_images (bool): Merge images. Defaults to True
    """
    coco_obj = coco_assistant.COCO_Assistant(image_folder, annotation_folder)
    coco_obj.merge_same(output_folder, merge_images)
def merge_datasets(image_folder, annotation_folder, output_folder='merged', merge_images=True, duplicate_frames=True)

Merge multiple datasets into a single dataset


image_folder : str
Path to image folder
annotation_folder : str
Path to annotation folder
output_folder : str
Path to output folder. Defaults to "merged"
merge_images : bool
Merge images. Defaults to True
duplicate_frames : bool
Duplicate frames. Defaults to True
Expand source code
def merge_datasets(image_folder, annotation_folder, output_folder="merged", merge_images=True, duplicate_frames=True):
    """Merge multiple datasets into a single dataset

    Thin wrapper that builds a ``COCO_Assistant`` for the two folders and
    calls its ``merge`` method with the remaining arguments.

    Args:
        image_folder (str): Path to image folder
        annotation_folder (str): Path to annotation folder
        output_folder (str): Path to output folder. Defaults to "merged"
        merge_images (bool): Merge images. Defaults to True
        duplicate_frames (bool): Duplicate frames. Defaults to True
    """
    coco_obj = coco_assistant.COCO_Assistant(image_folder, annotation_folder)
    coco_obj.merge(output_folder, merge_images, duplicate_frames)
def remove_categories(annotation_file, categories, output_annotation_file, keep_blank_images=True)

Remove the categories from the annotation file


annotation_file : str
path to annotation file
categories : list
list of categories to remove
output_annotation_file : str
path to output annotation file
keep_blank_images : bool
keep images with no annotations
Expand source code
def remove_categories(annotation_file, categories, output_annotation_file, keep_blank_images=True):
    """Remove the categories from the annotation file

    Categories whose name is in ``categories`` are dropped together with all
    of their annotations, and the result is written as JSON.

    Args:
        annotation_file (str): path to annotation file
        categories (list): list of categories to remove
        output_annotation_file (str): path to output annotation file
        keep_blank_images (bool): keep images with no annotations. When False,
            images that lose all of their annotations through this removal are
            dropped; images that had no annotations to begin with are kept.
    """
    with open(annotation_file) as f:
        data = json.load(f)

    removed_category_ids = {category['id'] for category in data['categories']
                            if category['name'] in categories}
    data['categories'] = [category for category in data['categories']
                          if category['name'] not in categories]

    removed_image_ids = set()  # images carrying at least one removed annotation
    kept_image_ids = set()     # images carrying at least one surviving annotation
    annotation_list = []
    for annotation in data['annotations']:
        if annotation['category_id'] in removed_category_ids:
            removed_image_ids.add(annotation['image_id'])
        else:
            kept_image_ids.add(annotation['image_id'])
            annotation_list.append(annotation)

    if not keep_blank_images:
        # Only images that became blank because of this removal are dropped.
        image_ids_remove = removed_image_ids - kept_image_ids
        data['images'] = [image for image in data['images'] if image['id'] not in image_ids_remove]
    data['annotations'] = annotation_list

    with open(output_annotation_file, 'w') as f:
        json.dump(data, f)