fix: create cogent dataset

fa9883e4 · Jérôme Botoko Ekila · 439cae36 · fa9883e4
Commit fa9883e4 authored 1 year ago by Jérôme Botoko Ekila
--- a/scripts/preprocess_data/cogent/preprocess.py
+++ b/scripts/preprocess_data/cogent/preprocess.py
+import argparse
+import json
+import logging
+import os
+import shutil
+
+import cv2 as cv
+import numpy as np
+from detectron2 import model_zoo
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultPredictor
+from scipy.spatial import distance as d
+from tqdm import tqdm
+
+from nmn.datasets.generate.utils.masks import calculate_mid_mask, convert_array_to_rle
+from nmn.utils.config import load_json, load_module_config
+from nmn.utils.dotdict import DotDict
+
+# Command-line interface of this preprocessing script.
+parser = argparse.ArgumentParser(
+    description="Preprocess the CoGenT scenes and annotate them with object masks."
+)
+
+# Opt-in flag: when absent, load_detectron places the model on the CPU.
+parser.add_argument(
+    "--use_cuda",
+    action="store_true",
+    help="run the Detectron2 predictor on cuda:0 instead of the CPU",
+)
+
+
+def split_data() -> None:
+    """Carve a held-out validation set out of the CoGenT trainA scenes.
+
+    The split is positional, not random: the first 60000 trainA scenes stay
+    in ``trainA`` and all remaining ones become ``valA``. The original
+    ``valA``/``valB`` scene files are re-exported as ``testA``/``testB``.
+
+    Side effects:
+        * renames ``data/cogent/scenes`` to ``data/cogent/raw_scenes``
+          (this fails if the source directory is missing, so the function
+          is not meant to be run twice on the same data);
+        * writes one ``CLEVR_<split>_splitscenes.json`` per split into
+          ``data/cogent/raw_scenes``.
+    """
+    logging.info("Reading datasets...")
+    # Move the original annotations aside; after this call the target
+    # directory is guaranteed to exist, so no extra makedirs is needed.
+    os.rename("data/cogent/scenes", "data/cogent/raw_scenes")
+    trainA = load_json("data/cogent/raw_scenes/CLEVR_trainA_scenes.json")
+    testA = load_json("data/cogent/raw_scenes/CLEVR_valA_scenes.json")
+    testB = load_json("data/cogent/raw_scenes/CLEVR_valB_scenes.json")
+
+    all_train_scenes = list(trainA["scenes"])
+
+    # Split name -> scenes that belong to it (insertion order = write order).
+    splits = {
+        "trainA": all_train_scenes[:60000],
+        "valA": all_train_scenes[60000:],
+        "testA": list(testA["scenes"]),
+        "testB": list(testB["scenes"]),
+    }
+
+    for dataset, split_scenes in splits.items():
+        with open(f"data/cogent/raw_scenes/CLEVR_{dataset}_splitscenes.json", "w") as f:
+            # NOTE(review): the info text mentions mask annotations although
+            # these files only contain the raw split - kept unchanged because
+            # it is part of the written output.
+            payload = {
+                "info": f"COGENT {dataset} dataset annotated with object mask in coco rle format",
+                "scenes": split_scenes,
+            }
+            logging.info(f"Writing {dataset} set...")
+            json.dump(payload, f)
+
+
+def move_images() -> None:
+    """Rearrange the image folders so they line up with the new splits.
+
+    The original test folders are parked as ``unusedA``/``unusedB``, the
+    original validation folders take over the ``testA``/``testB`` names, and
+    the trainA images numbered 60000-69999 are moved into a fresh ``valA``
+    folder. The moved files keep their ``CLEVR_trainA_`` filename prefix.
+    """
+    # Order matters: the test folders must be renamed away before the
+    # validation folders can take over their names.
+    folder_renames = (
+        ("data/cogent/images/testA", "data/cogent/images/unusedA"),
+        ("data/cogent/images/testB", "data/cogent/images/unusedB"),
+        ("data/cogent/images/valA", "data/cogent/images/testA"),
+        ("data/cogent/images/valB", "data/cogent/images/testB"),
+    )
+    for old_path, new_path in folder_renames:
+        os.rename(old_path, new_path)
+
+    os.makedirs("data/cogent/images/valA", exist_ok=True)
+
+    for image_number in range(60000, 70000):
+        # Zero-padded to six digits, matching the CLEVR file naming.
+        padded = f"{image_number:06d}"
+        shutil.move(
+            f"data/cogent/images/trainA/CLEVR_trainA_{padded}.png",
+            f"data/cogent/images/valA/CLEVR_trainA_{padded}.png",
+        )
+
+
+# Lookup from the integer class ids found in the predictor's `pred_classes`
+# output to shape names (used by generate_masks).
+# NOTE(review): the id order presumably mirrors the label order the detector
+# weights were trained with - confirm against the training setup.
+CATEGORY_TO_SHAPE = {0: "sphere", 1: "cylinder", 2: "cube"}
+
+
+def load_detectron(use_cuda: bool, module_info: DotDict) -> DefaultPredictor:
+    """Build a Detectron2 ``DefaultPredictor`` from the module configuration.
+
+    Args:
+        use_cuda: place the model on ``cuda:0`` when true, on the CPU otherwise.
+        module_info: module settings; the entries read here are
+            ``architecture_path``, ``weights``, ``mask_format``, ``n_classes``
+            and ``threshold``.
+
+    Returns:
+        A predictor created from the assembled configuration.
+    """
+    if use_cuda:
+        device = "cuda:0"
+    else:
+        device = "cpu"
+
+    # Start from the Detectron2 defaults and layer the model-zoo
+    # architecture file on top before applying the project overrides.
+    config = get_cfg()
+    architecture_file = model_zoo.get_config_file(module_info["architecture_path"])
+    config.merge_from_file(architecture_file)
+
+    config.DATASETS.TRAIN = ("clevr_mini_train",)
+    config.DATASETS.TEST = ()
+    config.DATALOADER.NUM_WORKERS = 0
+    config.MODEL.DEVICE = device
+    config.MODEL.WEIGHTS = module_info.weights
+    config.INPUT.MASK_FORMAT = module_info.mask_format
+    config.MODEL.ROI_HEADS.NUM_CLASSES = module_info.n_classes
+    config.MODEL.ROI_HEADS.SCORE_THRESH_TEST = module_info.threshold
+
+    predictor = DefaultPredictor(config)
+    return predictor
+
+
+def load_img(dataset: str, filename: str) -> np.ndarray:
+    """Read one image of a split from ``data/cogent/images/<dataset>``.
+
+    Args:
+        dataset: name of the split folder (e.g. ``"trainA"``).
+        filename: image file name inside that folder.
+
+    Returns:
+        The decoded image after the ``COLOR_RGBA2RGB`` conversion.
+
+    Raises:
+        FileNotFoundError: if OpenCV could not read the file.
+    """
+    img_path = os.path.join(f"data/cogent/images/{dataset}", filename)
+    img = cv.imread(img_path)
+    # cv.imread signals a missing/unreadable file by returning None instead of
+    # raising; fail here with the path rather than with an opaque OpenCV
+    # error inside cvtColor.
+    if img is None:
+        raise FileNotFoundError(f"Could not read image: {img_path}")
+    # NOTE(review): cv.imread with default flags presumably yields a
+    # 3-channel BGR array, in which case this conversion leaves the channel
+    # order untouched - confirm this is the format the predictor expects.
+    img = cv.cvtColor(img, cv.COLOR_RGBA2RGB)
+    return img
+
+
+def generate_masks(use_cuda: bool, module_info: DotDict):
+    """Predict object masks for every scene image and dump them per split.
+
+    For each of the ``trainA``, ``valA`` and ``testA`` splits the split-scene
+    file is loaded, the predictor is run on every scene image, and the
+    results are written to ``data/cogent/raw_scenes/CLEVR_<split>_masks.json``
+    as three parallel lists: ``image_idxs``, ``object_masks`` (converted with
+    ``convert_array_to_rle``) and ``shapes``.
+
+    NOTE(review): ``testB`` is not processed here although split_data writes
+    a split file for it - confirm this is intended.
+
+    Args:
+        use_cuda: forwarded to ``load_detectron``.
+        module_info: forwarded to ``load_detectron``.
+    """
+    logging.info("Loading Detectron...")
+    predictor: DefaultPredictor = load_detectron(use_cuda, module_info)
+
+    for dataset in ["trainA", "valA", "testA"]:
+        logging.info("Loading scenes...")
+        scene_file = f"data/cogent/raw_scenes/CLEVR_{dataset}_splitscenes.json"
+        scenes = load_json(scene_file)["scenes"]
+
+        # Parallel output lists: entry k of each list describes the same
+        # predicted instance.
+        image_idxs, object_masks, shapes = [], [], []
+
+        logging.info("Processing scenes...")
+        for scene in tqdm(scenes):
+            image: np.ndarray = load_img(dataset, scene["image_filename"])
+            instances = predictor(image)["instances"]
+
+            instance_masks = instances.pred_masks
+            class_ids = instances.pred_classes.cpu().numpy()
+            shape_names = [CATEGORY_TO_SHAPE[class_id] for class_id in class_ids]
+
+            for instance_mask, shape_name in zip(instance_masks, shape_names):
+                # Bring the mask to host memory before encoding it.
+                encoded_mask = convert_array_to_rle(instance_mask.cpu().numpy())
+                image_idxs.append(scene["image_index"])
+                object_masks.append(encoded_mask)
+                shapes.append(shape_name)
+        logging.info("Done processing scenes")
+
+        # Intermediate result, read back by match_masks.
+        mask_file = f"data/cogent/raw_scenes/CLEVR_{dataset}_masks.json"
+        with open(mask_file, "w") as f:
+            json.dump(
+                {
+                    "image_idxs": image_idxs,
+                    "object_masks": object_masks,
+                    "shapes": shapes,
+                },
+                f,
+            )
+
+
+def match_masks() -> None:
+    """
+    Match the objects in each scene to the calculated masks
+    and write the combination to a common file per split.
+
+    For every scene, each object greedily receives the not-yet-assigned mask
+    whose centre (``calculate_mid_mask``) is closest to the object's
+    ``pixel_coords``; ties go to the mask that appears first.
+
+    In principle this solution is not watertight:
+      * when the centres of two objects are very close, the greedy choice
+        can pair an object with the wrong mask;
+      * scenes whose number of masks differs from their number of objects
+        are left out of the output entirely.
+    According to the original author, failures are rare and have only been
+    observed a handful of times in a dataset of millions of objects.
+
+    Reads the ``*_masks.json`` and ``*_splitscenes.json`` files from
+    ``data/cogent/raw_scenes`` and writes ``data/cogent/scenes/<split>.json``.
+    """
+    os.makedirs("data/cogent/scenes", exist_ok=True)
+    for dataset in ["trainA", "valA", "testA"]:
+
+        logging.info("Loading masks...")
+        masks = load_json(f"data/cogent/raw_scenes/CLEVR_{dataset}_masks.json")
+        logging.info("Loading CLEVR data...")
+        data = load_json(f"data/cogent/raw_scenes/CLEVR_{dataset}_splitscenes.json")
+        scenes = data["scenes"]
+
+        # Group the mask positions by image once, instead of rescanning the
+        # complete mask list for every scene (which is quadratic overall).
+        # Positions stay in their original order within each group.
+        mask_indices_by_image = {}
+        for i, mask_img_idx in enumerate(masks["image_idxs"]):
+            mask_indices_by_image.setdefault(mask_img_idx, []).append(i)
+
+        logging.info("Matching masks to objects...")
+        annotated_scenes = []
+        skipped = 0
+        for scene in tqdm(scenes):
+            # retrieve the indices of the masks associated with this image
+            mask_indices = mask_indices_by_image.get(scene["image_index"], [])
+            # only scenes with exactly one mask per object can be matched
+            if len(mask_indices) != len(scene["objects"]):
+                skipped += 1
+                continue
+
+            RLE_masks = [masks["object_masks"][i] for i in mask_indices]
+            # calculate middle coordinate for each mask
+            mask_coords = [calculate_mid_mask(m) for m in RLE_masks]
+
+            # positions (into RLE_masks) that already belong to an object
+            assigned = set()
+            for obj in scene["objects"]:
+                obj_coord = obj["pixel_coords"][0:2]
+                # calculate the distance from the object to all masks
+                distances = [
+                    d.euclidean(obj_coord, mask_coord) for mask_coord in mask_coords
+                ]
+                # visit the masks from closest to farthest (stable sort, so
+                # equal distances keep their original order) and take the
+                # first one that is not yet assigned
+                by_distance = sorted(range(len(distances)), key=distances.__getitem__)
+                for pos in by_distance:
+                    if pos not in assigned:
+                        assigned.add(pos)
+                        obj["mask"] = RLE_masks[pos]
+                        break
+
+            # add to the annotated scenes list
+            annotated_scenes.append(scene)
+
+        logging.info(
+            "Skipped %d of %d scenes with a mask/object count mismatch",
+            skipped,
+            len(scenes),
+        )
+        # output data
+        annotated_data = {
+            "info": f"COGENT dataset {dataset} annotated with object mask in coco rle format",
+            "scenes": annotated_scenes,
+        }
+        with open(f"data/cogent/scenes/{dataset}.json", "w") as f:
+            json.dump(annotated_data, f)
+
+
+def main() -> None:
+    """Parse the command line, load the detector settings and run the pipeline.
+
+    The namespace is extended with the experiment name and module config file
+    before being handed to ``load_module_config``.
+    NOTE(review): ``load_module_config`` presumably fills in
+    ``args.module_info``, which is read below - confirm.
+    """
+    # The pipeline reports progress through logging.info; without a configured
+    # root logger those messages are dropped at the default WARNING level.
+    # basicConfig does nothing if handlers were already installed elsewhere.
+    logging.basicConfig(level=logging.INFO)
+
+    args = parser.parse_args()
+    args.experiment = "clevr"
+    args.module_config = "detectron.json"
+    load_module_config(args)
+    # NOTE(review): the other pipeline steps are disabled, so only the mask
+    # generation runs. split_data and move_images rename directories and
+    # cannot be re-run on already processed data; presumably the steps are
+    # toggled by hand - confirm before enabling.
+    # split_data()
+    # move_images()
+    generate_masks(
+        args.use_cuda,
+        args.module_info,
+    )
+    # match_masks()
+
+
+# Run the preprocessing only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()