Module data_handling.split
Functions to split data into training and validation sets on both coco and yolo formats as well as creating a new annotation file for a given subset of images in the folder.
Expand source code
"""Functions to split data into training and validation sets on both coco and yolo formats as well as
creating a new annotation file for a given subset of images in the folder."""
import json
import os
import random
import shutil
from pathlib import Path
from tqdm import tqdm
random.seed(42)
def annotation_folder(image_folder, annotation_file, output_annotation_file):
"""Create annotation file for a given subset of images in the folder.
Args:
image_folder (str): path to the folder containing the images to filter annotations for.
annotation_file (str): path to the annotation file.
output_annotation_file (str): path to the output annotation file.
"""
data = json.load(open(annotation_file))
annotations = data['annotations']
images = data['images']
images_in_use = os.listdir(image_folder)
image_list = []
image_id_list = []
for image in images:
if image['file_name'] in images_in_use:
image_id_list.append(image['id'])
image_list.append(image)
annotation_list = []
for annotation in annotations:
if annotation['image_id'] in image_id_list:
annotation_list.append(annotation)
data['annotations'] = annotation_list
data['images'] = image_list
with open(output_annotation_file, "w") as outfile:
json.dump(data, outfile)
def train_validation_coco(folder, annotation_file, combined_folder=None, output_folder=None, validation_percentage=10):
"""split coco data into training and validation
Args:
folder (str): path to image folder
annotation_file (str): path to annotation file
combined_folder (str, optional): Path to combined folder for training and validation.
Both will be stored in the same folder with 'train_' and 'val_' names. Defaults to None.
output_folder (str, optional): Path to parent folder which will have seperate training and validation folders. Defaults to None.
validation_percentage (int, optional): Percentage of images to put in validation set. Defaults to 10.
Raises:
Exception: If both combined_folder and output_folder are None.
"""
if output_folder is None and combined_folder is None:
raise Exception('Either output_folder or combined_folder must be specified')
if output_folder is not None:
if not os.path.exists(output_folder):
os.mkdir(output_folder)
if not os.path.exists(os.path.join(output_folder, 'train')):
os.mkdir(os.path.join(output_folder, 'train'))
if not os.path.exists(os.path.join(output_folder, 'train', 'annotations')):
os.mkdir(os.path.join(output_folder, 'train', 'annotations'))
if not os.path.exists(os.path.join(output_folder, 'train', 'images')):
os.mkdir(os.path.join(output_folder, 'train', 'images'))
if not os.path.exists(os.path.join(output_folder, 'validation')):
os.mkdir(os.path.join(output_folder, 'validation'))
if not os.path.exists(os.path.join(output_folder, 'validation', 'annotations')):
os.mkdir(os.path.join(output_folder, 'validation', 'annotations'))
if not os.path.exists(os.path.join(output_folder, 'validation', 'images')):
os.mkdir(os.path.join(output_folder, 'validation', 'images'))
data = json.load(open(annotation_file))
annotations = data['annotations']
images = data['images']
image_ids = []
old_train_list, old_val_list = [], []
for image in images:
if image['file_name'].find('val') == -1 or image['file_name'].find('train') == -1:
image_ids.append(image['id'])
elif image['file_name'].find('val') != -1:
old_val_list.append(image['id'])
elif image['file_name'].find('train') != -1:
old_train_list.append(image['id'])
random.shuffle(image_ids)
validation_size = int(len(image_ids) * validation_percentage / 100)
validation_image_ids = image_ids[:validation_size]
validation_image_ids.extend(old_val_list)
train_image_ids = image_ids[validation_size:]
train_image_ids.extend(old_train_list)
if output_folder is not None:
validation_annotation_list, training_annotation_list = [], []
for annotation in annotations:
if annotation['image_id'] in validation_image_ids:
validation_annotation_list.append(annotation)
elif annotation['image_id'] in train_image_ids:
training_annotation_list.append(annotation)
validation_image_list, training_image_list = [], []
for image in tqdm(images, desc='output folder'):
if image['id'] in validation_image_ids:
if image['file_name'].find('val') != -1:
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(output_folder, 'validation', 'images', image['file_name']))
else:
adding = 'val_'
counter = 1
while os.path.exists(os.path.join(folder, adding + image['file_name'])):
adding = 'val_' + str(counter) + '_'
counter += 1
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(output_folder, 'validation', 'images', adding + image['file_name']))
new_image = image.copy()
new_image['file_name'] = adding + new_image['file_name']
validation_image_list.append(new_image)
elif image['id'] in train_image_ids:
if image['file_name'].find('train') != -1:
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(output_folder, 'train', 'images', image['file_name']))
else:
adding = 'train_'
counter = 1
while os.path.isfile(os.path.join(folder, adding + image['file_name'])):
adding = 'train_' + str(counter) + '_'
counter += 1
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(output_folder, 'train', 'images', adding + image['file_name']))
new_image = image.copy()
new_image['file_name'] = adding + new_image['file_name']
training_image_list.append(new_image)
data['annotations'] = validation_annotation_list
data['images'] = validation_image_list
with open(os.path.join(output_folder, 'validation', 'annotations', 'instances_val.json'), "w") as outfile:
json.dump(data, outfile)
data['annotations'] = training_annotation_list
data['images'] = training_image_list
with open(os.path.join(output_folder, 'train', 'annotations', 'instances_train.json'), "w") as outfile:
json.dump(data, outfile)
if combined_folder is not None:
if not os.path.exists(combined_folder):
os.mkdir(combined_folder)
if not os.path.exists(os.path.join(combined_folder, 'annotations')):
os.mkdir(os.path.join(combined_folder, 'annotations'))
if not os.path.exists(os.path.join(combined_folder, 'images')):
os.mkdir(os.path.join(combined_folder, 'images'))
image_list = []
for image in tqdm(images, desc='combined folder'):
if image['file_name'].find('val') != -1:
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(combined_folder, 'images', image['file_name']))
elif image['file_name'].find('train') != -1:
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(combined_folder, 'images', image['file_name']))
else:
if image['id'] in validation_image_ids:
adding = 'val_'
counter = 1
while os.path.exists(os.path.join(folder, adding + image['file_name'])):
adding = 'val_' + str(counter) + '_'
counter += 1
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(combined_folder, 'images', adding + image['file_name']))
image['file_name'] = adding + image['file_name']
else:
adding = 'train_'
counter = 1
while os.path.isfile(os.path.join(folder, adding + image['file_name'])):
adding = 'train_' + str(counter) + '_'
counter += 1
shutil.copy(os.path.join(folder, image['file_name']),
os.path.join(combined_folder, 'images', adding + image['file_name']))
image['file_name'] = adding + image['file_name']
image_list.append(image)
data['annotations'] = annotations
data['images'] = image_list
with open(os.path.join(combined_folder, 'annotations', 'instances_all.json'), "w") as outfile:
json.dump(data, outfile)
def train_validation_yolo(folder, output_folder, validation_percentage=10):
""" split yolo data into training and validation
Args:
folder (str): path to folder containing yolo data
output_folder (str): path to folder to save split data
validation_percentage (int): percentage of data to use for validation
"""
if not os.path.exists(output_folder):
os.mkdir(output_folder)
if not os.path.exists(os.path.join(output_folder, 'train')):
os.mkdir(os.path.join(output_folder, 'train'))
if not os.path.exists(os.path.join(output_folder, 'train', 'labels')):
os.mkdir(os.path.join(output_folder, 'train', 'labels'))
if not os.path.exists(os.path.join(output_folder, 'train', 'images')):
os.mkdir(os.path.join(output_folder, 'train', 'images'))
if not os.path.exists(os.path.join(output_folder, 'val')):
os.mkdir(os.path.join(output_folder, 'val'))
if not os.path.exists(os.path.join(output_folder, 'val', 'labels')):
os.mkdir(os.path.join(output_folder, 'val', 'labels'))
if not os.path.exists(os.path.join(output_folder, 'val', 'images')):
os.mkdir(os.path.join(output_folder, 'val', 'images'))
data = []
for img in os.listdir(os.path.join(folder, 'images')):
if img.find('val') == -1 and img.find('train') == -1:
data.append(img)
random.shuffle(data)
validation_size = int(len(data) * validation_percentage / 100)
validation_data = data[:validation_size]
training_data = data[validation_size:]
for img in tqdm(os.listdir(os.path.join(folder, 'images')), desc='Copying images'):
if img.find('val') != -1:
shutil.copy(os.path.join(folder, 'images', img),
os.path.join(output_folder, 'val', 'images', img))
if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')):
shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'),
Path(os.path.join(output_folder, 'val', 'labels', img)).with_suffix('.txt'))
elif img.find('train') != -1:
shutil.copy(os.path.join(folder, 'images', img),
os.path.join(output_folder, 'train', 'images', img))
if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')):
shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'),
Path(os.path.join(output_folder, 'train', 'labels', img)).with_suffix('.txt'))
elif img in validation_data:
adding = 'val_'
counter = 1
while os.path.isfile(os.path.join(folder, 'images', adding + img)):
adding = 'val_' + str(counter) + '_'
counter += 1
shutil.copy(os.path.join(folder, 'images', img),
os.path.join(output_folder, 'val', 'images', adding + img))
if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')):
shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'),
Path(os.path.join(output_folder, 'val', 'labels', adding + img)).with_suffix('.txt'))
elif img in training_data:
adding = 'train_'
counter = 1
while os.path.isfile(os.path.join(folder, 'images', adding + img)):
adding = 'train_' + str(counter) + '_'
counter += 1
shutil.copy(os.path.join(folder, 'images', img),
os.path.join(output_folder, 'train', 'images', adding + img))
if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')):
shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'),
Path(os.path.join(output_folder, 'train', 'labels', adding + img)).with_suffix('.txt'))
if os.path.exists(os.path.join(folder, 'input_data.yaml')):
shutil.copy(os.path.join(folder, 'input_data.yaml'),
os.path.join(output_folder, 'input_data.yaml'))
if __name__ == '__main__':
import sys
train_validation_yolo('/home/vardan/Desktop/testing/yolo', '/home/vardan/Desktop/testing/yoloout', 20)
Functions
def annotation_folder(image_folder, annotation_file, output_annotation_file)
-
Create annotation file for a given subset of images in the folder.
Args
image_folder
:str
- path to the folder containing the images to filter annotations for.
annotation_file
:str
- path to the annotation file.
output_annotation_file
:str
- path to the output annotation file.
Expand source code
def annotation_folder(image_folder, annotation_file, output_annotation_file): """Create annotation file for a given subset of images in the folder. Args: image_folder (str): path to the folder containing the images to filter annotations for. annotation_file (str): path to the annotation file. output_annotation_file (str): path to the output annotation file. """ data = json.load(open(annotation_file)) annotations = data['annotations'] images = data['images'] images_in_use = os.listdir(image_folder) image_list = [] image_id_list = [] for image in images: if image['file_name'] in images_in_use: image_id_list.append(image['id']) image_list.append(image) annotation_list = [] for annotation in annotations: if annotation['image_id'] in image_id_list: annotation_list.append(annotation) data['annotations'] = annotation_list data['images'] = image_list with open(output_annotation_file, "w") as outfile: json.dump(data, outfile)
def train_validation_coco(folder, annotation_file, combined_folder=None, output_folder=None, validation_percentage=10)
-
split coco data into training and validation
Args
folder
:str
- path to image folder
annotation_file
:str
- path to annotation file
combined_folder
:str
, optional- Path to combined folder for training and validation. Both will be stored in the same folder with 'train_' and 'val_' names. Defaults to None.
output_folder
:str
, optional- Path to parent folder which will have seperate training and validation folders. Defaults to None.
validation_percentage
:int
, optional- Percentage of images to put in validation set. Defaults to 10.
Raises
Exception
- If both combined_folder and output_folder are None.
Expand source code
def train_validation_coco(folder, annotation_file, combined_folder=None, output_folder=None, validation_percentage=10): """split coco data into training and validation Args: folder (str): path to image folder annotation_file (str): path to annotation file combined_folder (str, optional): Path to combined folder for training and validation. Both will be stored in the same folder with 'train_' and 'val_' names. Defaults to None. output_folder (str, optional): Path to parent folder which will have seperate training and validation folders. Defaults to None. validation_percentage (int, optional): Percentage of images to put in validation set. Defaults to 10. Raises: Exception: If both combined_folder and output_folder are None. """ if output_folder is None and combined_folder is None: raise Exception('Either output_folder or combined_folder must be specified') if output_folder is not None: if not os.path.exists(output_folder): os.mkdir(output_folder) if not os.path.exists(os.path.join(output_folder, 'train')): os.mkdir(os.path.join(output_folder, 'train')) if not os.path.exists(os.path.join(output_folder, 'train', 'annotations')): os.mkdir(os.path.join(output_folder, 'train', 'annotations')) if not os.path.exists(os.path.join(output_folder, 'train', 'images')): os.mkdir(os.path.join(output_folder, 'train', 'images')) if not os.path.exists(os.path.join(output_folder, 'validation')): os.mkdir(os.path.join(output_folder, 'validation')) if not os.path.exists(os.path.join(output_folder, 'validation', 'annotations')): os.mkdir(os.path.join(output_folder, 'validation', 'annotations')) if not os.path.exists(os.path.join(output_folder, 'validation', 'images')): os.mkdir(os.path.join(output_folder, 'validation', 'images')) data = json.load(open(annotation_file)) annotations = data['annotations'] images = data['images'] image_ids = [] old_train_list, old_val_list = [], [] for image in images: if image['file_name'].find('val') == -1 or image['file_name'].find('train') == -1: image_ids.append(image['id']) elif image['file_name'].find('val') != -1: old_val_list.append(image['id']) elif image['file_name'].find('train') != -1: old_train_list.append(image['id']) random.shuffle(image_ids) validation_size = int(len(image_ids) * validation_percentage / 100) validation_image_ids = image_ids[:validation_size] validation_image_ids.extend(old_val_list) train_image_ids = image_ids[validation_size:] train_image_ids.extend(old_train_list) if output_folder is not None: validation_annotation_list, training_annotation_list = [], [] for annotation in annotations: if annotation['image_id'] in validation_image_ids: validation_annotation_list.append(annotation) elif annotation['image_id'] in train_image_ids: training_annotation_list.append(annotation) validation_image_list, training_image_list = [], [] for image in tqdm(images, desc='output folder'): if image['id'] in validation_image_ids: if image['file_name'].find('val') != -1: shutil.copy(os.path.join(folder, image['file_name']), os.path.join(output_folder, 'validation', 'images', image['file_name'])) else: adding = 'val_' counter = 1 while os.path.exists(os.path.join(folder, adding + image['file_name'])): adding = 'val_' + str(counter) + '_' counter += 1 shutil.copy(os.path.join(folder, image['file_name']), os.path.join(output_folder, 'validation', 'images', adding + image['file_name'])) new_image = image.copy() new_image['file_name'] = adding + new_image['file_name'] validation_image_list.append(new_image) elif image['id'] in train_image_ids: if image['file_name'].find('train') != -1: shutil.copy(os.path.join(folder, image['file_name']), os.path.join(output_folder, 'train', 'images', image['file_name'])) else: adding = 'train_' counter = 1 while os.path.isfile(os.path.join(folder, adding + image['file_name'])): adding = 'train_' + str(counter) + '_' counter += 1 shutil.copy(os.path.join(folder, image['file_name']), os.path.join(output_folder, 'train', 'images', adding + image['file_name'])) new_image = image.copy() new_image['file_name'] = adding + new_image['file_name'] training_image_list.append(new_image) data['annotations'] = validation_annotation_list data['images'] = validation_image_list with open(os.path.join(output_folder, 'validation', 'annotations', 'instances_val.json'), "w") as outfile: json.dump(data, outfile) data['annotations'] = training_annotation_list data['images'] = training_image_list with open(os.path.join(output_folder, 'train', 'annotations', 'instances_train.json'), "w") as outfile: json.dump(data, outfile) if combined_folder is not None: if not os.path.exists(combined_folder): os.mkdir(combined_folder) if not os.path.exists(os.path.join(combined_folder, 'annotations')): os.mkdir(os.path.join(combined_folder, 'annotations')) if not os.path.exists(os.path.join(combined_folder, 'images')): os.mkdir(os.path.join(combined_folder, 'images')) image_list = [] for image in tqdm(images, desc='combined folder'): if image['file_name'].find('val') != -1: shutil.copy(os.path.join(folder, image['file_name']), os.path.join(combined_folder, 'images', image['file_name'])) elif image['file_name'].find('train') != -1: shutil.copy(os.path.join(folder, image['file_name']), os.path.join(combined_folder, 'images', image['file_name'])) else: if image['id'] in validation_image_ids: adding = 'val_' counter = 1 while os.path.exists(os.path.join(folder, adding + image['file_name'])): adding = 'val_' + str(counter) + '_' counter += 1 shutil.copy(os.path.join(folder, image['file_name']), os.path.join(combined_folder, 'images', adding + image['file_name'])) image['file_name'] = adding + image['file_name'] else: adding = 'train_' counter = 1 while os.path.isfile(os.path.join(folder, adding + image['file_name'])): adding = 'train_' + str(counter) + '_' counter += 1 shutil.copy(os.path.join(folder, image['file_name']), os.path.join(combined_folder, 'images', adding + image['file_name'])) image['file_name'] = adding + image['file_name'] image_list.append(image) data['annotations'] = annotations data['images'] = image_list with open(os.path.join(combined_folder, 'annotations', 'instances_all.json'), "w") as outfile: json.dump(data, outfile)
def train_validation_yolo(folder, output_folder, validation_percentage=10)
-
split yolo data into training and validation
Args
folder
:str
- path to folder containing yolo data
output_folder
:str
- path to folder to save split data
validation_percentage
:int
- percentage of data to use for validation
Expand source code
def train_validation_yolo(folder, output_folder, validation_percentage=10): """ split yolo data into training and validation Args: folder (str): path to folder containing yolo data output_folder (str): path to folder to save split data validation_percentage (int): percentage of data to use for validation """ if not os.path.exists(output_folder): os.mkdir(output_folder) if not os.path.exists(os.path.join(output_folder, 'train')): os.mkdir(os.path.join(output_folder, 'train')) if not os.path.exists(os.path.join(output_folder, 'train', 'labels')): os.mkdir(os.path.join(output_folder, 'train', 'labels')) if not os.path.exists(os.path.join(output_folder, 'train', 'images')): os.mkdir(os.path.join(output_folder, 'train', 'images')) if not os.path.exists(os.path.join(output_folder, 'val')): os.mkdir(os.path.join(output_folder, 'val')) if not os.path.exists(os.path.join(output_folder, 'val', 'labels')): os.mkdir(os.path.join(output_folder, 'val', 'labels')) if not os.path.exists(os.path.join(output_folder, 'val', 'images')): os.mkdir(os.path.join(output_folder, 'val', 'images')) data = [] for img in os.listdir(os.path.join(folder, 'images')): if img.find('val') == -1 and img.find('train') == -1: data.append(img) random.shuffle(data) validation_size = int(len(data) * validation_percentage / 100) validation_data = data[:validation_size] training_data = data[validation_size:] for img in tqdm(os.listdir(os.path.join(folder, 'images')), desc='Copying images'): if img.find('val') != -1: shutil.copy(os.path.join(folder, 'images', img), os.path.join(output_folder, 'val', 'images', img)) if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')): shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'), Path(os.path.join(output_folder, 'val', 'labels', img)).with_suffix('.txt')) elif img.find('train') != -1: shutil.copy(os.path.join(folder, 'images', img), os.path.join(output_folder, 'train', 'images', img)) if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')): shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'), Path(os.path.join(output_folder, 'train', 'labels', img)).with_suffix('.txt')) elif img in validation_data: adding = 'val_' counter = 1 while os.path.isfile(os.path.join(folder, 'images', adding + img)): adding = 'val_' + str(counter) + '_' counter += 1 shutil.copy(os.path.join(folder, 'images', img), os.path.join(output_folder, 'val', 'images', adding + img)) if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')): shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'), Path(os.path.join(output_folder, 'val', 'labels', adding + img)).with_suffix('.txt')) elif img in training_data: adding = 'train_' counter = 1 while os.path.isfile(os.path.join(folder, 'images', adding + img)): adding = 'train_' + str(counter) + '_' counter += 1 shutil.copy(os.path.join(folder, 'images', img), os.path.join(output_folder, 'train', 'images', adding + img)) if os.path.exists(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt')): shutil.copy(Path(os.path.join(folder, 'labels', img)).with_suffix('.txt'), Path(os.path.join(output_folder, 'train', 'labels', adding + img)).with_suffix('.txt')) if os.path.exists(os.path.join(folder, 'input_data.yaml')): shutil.copy(os.path.join(folder, 'input_data.yaml'), os.path.join(output_folder, 'input_data.yaml'))