Dataset: KETI-AIR/coco
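The two example scripts below load the dataset through the repo's `coco.py` loading script and render the first few training samples: the first draws the detection bounding boxes, the second overlays the panoptic segmentation masks. As a quick sanity check before running them, you can load a config and inspect its schema. This is a minimal sketch, assuming the same local `coco.py` script, `2017` config, and cache directory used in the full examples that follow.

```python
from datasets import load_dataset

# Minimal sketch: load the detection config and inspect its schema.
# Assumes the local coco.py loading script and the "2017" config used below.
raw_datasets = load_dataset(
    "coco.py",
    "2017",
    cache_dir="./huggingface_datasets",
)

print(raw_datasets)                                    # splits and row counts
features = raw_datasets["train"].features
print(features["objects"])                             # per-object fields, e.g. "bbox" and "label"
print(features["objects"].feature["label"].names[:5])  # first few class names
```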
Bounding-box visualization for the detection config (`2017`):

```python
import os

from datasets import load_dataset
from PIL import Image, ImageFont, ImageDraw, ImageColor


def calc_lum(rgb):
    # Relative luminance (ITU-R BT.709), used to pick a readable text color.
    return 0.2126 * rgb[0] + 0.7152 * rgb[1] + 0.0722 * rgb[2]


COLOR_MAP = [ImageColor.getrgb(code) for name, code in ImageColor.colormap.items()]


def get_text_bbox(bb, tbb, margin, im_w, im_h, anchor="leftBottom"):
    # Compute the label background box and text origin for a given object bbox,
    # text bbox, and anchor corner, clamped to the image bounds.
    m = margin
    l, t, r, b = bb
    tl, tt, tr, tb = tbb
    bbw, bbh = r - l, b - t
    tbbw, tbbh = tr - tl, tb - tt
    # bbox (left-top)
    if anchor == "leftTop":
        ax, ay = l, t
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-bottom)
            x1, y1 = max(ax, 0), max(ay - tb - 2 * m, 0)
            x2, y2 = min(x1 + tr + 2 * m, im_w), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-top)
            x1, y1 = max(ax, 0), max(ay, 0)
            x2, y2 = min(x1 + tr + 2 * m, im_w), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "rightTop":
        ax, ay = r, t
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-bottom)
            x2, y1 = max(ax, 0), max(ay - tb - 2 * m, 0)
            x1, y2 = max(x2 - tr - 2 * m, 0), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-top)
            x2, y1 = max(ax, 0), max(ay, 0)
            x1, y2 = max(x2 - tr - 2 * m, 0), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "rightBottom":
        ax, ay = r, b
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-top)
            x2, y2 = min(ax, im_w), min(ay + tb + 2 * m, im_h)
            x1, y1 = max(x2 - tr - 2 * m, 0), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-bottom)
            x2, y2 = min(ax, im_w), max(ay, 0)
            x1, y1 = max(x2 - tr - 2 * m, 0), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "leftBottom":
        ax, ay = l, b
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-top)
            x1, y2 = min(ax, im_w), min(ay + tb + 2 * m, im_h)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-bottom)
            x1, y2 = min(ax, im_w), max(ay, 0)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "centerBottom":
        ax, ay = (l + r) // 2, b
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-top)
            x1, y2 = min(ax - tr // 2 - m, im_w), min(ay + tb + 2 * m, im_h)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-bottom)
            x1, y2 = min(ax - tr // 2 - m, im_w), max(ay, 0)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))


def draw_bbox(image, objects, out_path, label_names=None, font="Roboto-Bold.ttf",
              fontsize=15, fill=True, opacity=60, width=2, margin=3, anchor="leftBottom"):
    fnt = ImageFont.truetype(font, fontsize)
    im_w, im_h = image.size
    img = image.convert("RGBA")
    overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(overlay)
    for bb, lbl_id in zip(objects["bbox"], objects["label"]):
        c = COLOR_MAP[min(lbl_id, len(COLOR_MAP) - 1)]
        fill_c = c + (opacity,) if fill else None
        draw.rectangle((bb[0], bb[1], bb[2], bb[3]), outline=c, fill=fill_c, width=width)
        text = ""
        if label_names is not None:
            text = label_names[lbl_id]
        tbb = fnt.getbbox(text)
        btn_bbox, text_pos = get_text_bbox(bb, tbb, margin, im_w, im_h, anchor)
        # Black text on light colors, white text on dark colors.
        fc = (0, 0, 0) if calc_lum(c) > 150 else (255, 255, 255)
        draw.rectangle(btn_bbox, outline=c, fill=c + (255,))
        draw.text(text_pos, text, font=fnt, fill=fc + (255,))
        # Composite this object's overlay, then start a fresh overlay for the next object.
        img = Image.alpha_composite(img, overlay)
        overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)
    img = img.convert("RGB")
    img.save(out_path)


raw_datasets = load_dataset(
    "coco.py",
    "2017",
    cache_dir="./huggingface_datasets",
)

train_dataset = raw_datasets["train"]
label_list = raw_datasets["train"].features["objects"].feature["label"].names

# Annotate the first 10 training images; each is saved under its original filename.
for idx, item in zip(range(10), train_dataset):
    draw_bbox(item["image"], item["objects"], item["image/filename"], label_list)
```
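The loop above writes each annotated image to the current working directory under its original COCO filename (the script imports `os` but never uses it). If you would rather collect the outputs in a dedicated folder, a small variant is sketched below. It reuses `draw_bbox`, `train_dataset`, and `label_list` from the script above; the `outputs` directory name and the alternative `anchor`/`opacity` values are arbitrary illustrative choices, not part of the original example.

```python
import os

# Sketch: write annotated images into a separate folder instead of the working
# directory. Reuses draw_bbox / train_dataset / label_list defined above.
out_dir = "outputs"  # arbitrary folder name
os.makedirs(out_dir, exist_ok=True)

for idx, item in zip(range(10), train_dataset):
    out_path = os.path.join(out_dir, item["image/filename"])
    # anchor and opacity are optional; shown here only to illustrate the keywords.
    draw_bbox(item["image"], item["objects"], out_path, label_list,
              opacity=80, anchor="leftTop")
```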
Panoptic segmentation visualization for the `2017_panoptic` config:

```python
import numpy as np

from datasets import load_dataset
from PIL import Image, ImageFont, ImageDraw, ImageColor
from transformers.image_transforms import rgb_to_id


def calc_lum(rgb):
    # Relative luminance (ITU-R BT.709), used to pick a readable text color.
    return 0.2126 * rgb[0] + 0.7152 * rgb[1] + 0.0722 * rgb[2]


COLOR_MAP = [ImageColor.getrgb(code) for name, code in ImageColor.colormap.items()]


def get_text_bbox(bb, tbb, margin, im_w, im_h, anchor="leftBottom"):
    # Compute the label background box and text origin for a given object bbox,
    # text bbox, and anchor corner, clamped to the image bounds.
    m = margin
    l, t, r, b = bb
    tl, tt, tr, tb = tbb
    bbw, bbh = r - l, b - t
    tbbw, tbbh = tr - tl, tb - tt
    # bbox (left-top)
    if anchor == "leftTop":
        ax, ay = l, t
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-bottom)
            x1, y1 = max(ax, 0), max(ay - tb - 2 * m, 0)
            x2, y2 = min(x1 + tr + 2 * m, im_w), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-top)
            x1, y1 = max(ax, 0), max(ay, 0)
            x2, y2 = min(x1 + tr + 2 * m, im_w), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "rightTop":
        ax, ay = r, t
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-bottom)
            x2, y1 = max(ax, 0), max(ay - tb - 2 * m, 0)
            x1, y2 = max(x2 - tr - 2 * m, 0), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-top)
            x2, y1 = max(ax, 0), max(ay, 0)
            x1, y2 = max(x2 - tr - 2 * m, 0), min(y1 + tb + 2 * m, im_h)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "rightBottom":
        ax, ay = r, b
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-top)
            x2, y2 = min(ax, im_w), min(ay + tb + 2 * m, im_h)
            x1, y1 = max(x2 - tr - 2 * m, 0), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-bottom)
            x2, y2 = min(ax, im_w), max(ay, 0)
            x1, y1 = max(x2 - tr - 2 * m, 0), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "leftBottom":
        ax, ay = l, b
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-top)
            x1, y2 = min(ax, im_w), min(ay + tb + 2 * m, im_h)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-bottom)
            x1, y2 = min(ax, im_w), max(ay, 0)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
    elif anchor == "centerBottom":
        ax, ay = (l + r) // 2, b
        if tbbw * 3 > bbw or tbbh * 4 > bbh:
            # align (text box: left-top)
            x1, y2 = min(ax - tr // 2 - m, im_w), min(ay + tb + 2 * m, im_h)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))
        else:
            # align (text box: left-bottom)
            x1, y2 = min(ax - tr // 2 - m, im_w), max(ay, 0)
            x2, y1 = min(x1 + tr + 2 * m, im_w), max(y2 - tb - 2 * m, 0)
            return ((x1, y1, x2, y2), (max(x1 + m, 0), max(y1 + m, 0)))


# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
    """
    Compute the bounding boxes around the provided panoptic segmentation masks.

    Args:
        masks: masks in format `[number_masks, height, width]` where N is the number of masks

    Returns:
        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
    """
    if masks.size == 0:
        return np.zeros((0, 4))

    h, w = masks.shape[-2:]
    y = np.arange(0, h, dtype=np.float32)
    x = np.arange(0, w, dtype=np.float32)
    # see https://github.com/pytorch/pytorch/issues/50276
    y, x = np.meshgrid(y, x, indexing="ij")

    x_mask = masks * np.expand_dims(x, axis=0)
    x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
    x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
    x_min = x.filled(fill_value=1e8)
    x_min = x_min.reshape(x_min.shape[0], -1).min(-1)

    y_mask = masks * np.expand_dims(y, axis=0)
    y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
    y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
    y_min = y.filled(fill_value=1e8)
    y_min = y_min.reshape(y_min.shape[0], -1).min(-1)

    return np.stack([x_min, y_min, x_max, y_max], 1)


def draw_seg(image, panoptic_image, oids, labels, out_path, label_names=None,
             font="Roboto-Bold.ttf", fontsize=15, opacity=160, anchor="leftBottom"):
    fnt = ImageFont.truetype(font, fontsize)
    im_w, im_h = image.size

    # Decode the panoptic PNG (segment ids encoded as RGB) into integer ids,
    # then build one binary mask per annotated segment id.
    masks = np.asarray(panoptic_image, dtype=np.uint32)
    masks = rgb_to_id(masks)
    oids = np.array(oids, dtype=np.uint32)
    masks = masks == oids[:, None, None]
    masks = masks.astype(np.uint8)

    bboxes = masks_to_boxes(masks)

    img = image.convert("RGBA")
    for label, mask, bbox in zip(labels, masks, bboxes):
        c = COLOR_MAP[min(label, len(COLOR_MAP) - 1)]
        # Tint the segment with a semi-transparent class color.
        cf = np.array(c + (opacity,)).astype(np.uint8)
        cmask = mask[:, :, None] * cf[None, None, :]
        cmask = Image.fromarray(cmask)
        img = Image.alpha_composite(img, cmask)
        if label_names is not None:
            text = label_names[label]
            tbb = fnt.getbbox(text)
            btn_bbox, text_pos = get_text_bbox(bbox, tbb, 3, im_w, im_h, anchor=anchor)
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(overlay)
            # Black text on light colors, white text on dark colors.
            fc = (0, 0, 0) if calc_lum(c) > 150 else (255, 255, 255)
            draw.rectangle(btn_bbox, outline=c, fill=c + (255,))
            draw.text(text_pos, text, font=fnt, fill=fc + (255,))
            img = Image.alpha_composite(img, overlay)
    img = img.convert("RGB")
    img.save(out_path)


raw_datasets = load_dataset(
    "coco.py",
    "2017_panoptic",
    cache_dir="./huggingface_datasets",
    # data_dir="./data",
)

train_dataset = raw_datasets["train"]
label_list = raw_datasets["train"].features["panoptic_objects"].feature["label"].names

# Annotate the first 10 training images; outputs are prefixed with "panoptic_".
for idx, item in zip(range(10), train_dataset):
    draw_seg(
        item["image"],
        item["panoptic_image"],
        item["panoptic_objects"]["id"],
        item["panoptic_objects"]["label"],
        "panoptic_" + item["image/filename"],
        label_list,
    )
```
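`rgb_to_id` converts the panoptic PNG, where each segment id is encoded in the pixel color, back into integer ids using the COCO panoptic convention id = R + 256*G + 256**2*B. If you prefer not to import the helper from `transformers.image_transforms`, the conversion is a one-liner; the sketch below is a stand-alone reimplementation for an H x W x 3 integer array, offered as an assumption rather than code from the dataset itself.

```python
import numpy as np

def rgb_to_id(color: np.ndarray) -> np.ndarray:
    """Decode a COCO panoptic RGB image into per-pixel segment ids.

    COCO panoptic PNGs encode each segment id as id = R + 256*G + 256**2*B,
    so this should produce the same ids as the transformers helper used above.
    """
    color = color.astype(np.uint32)
    return color[..., 0] + 256 * color[..., 1] + 256 * 256 * color[..., 2]

# Example (assumes `train_dataset` from the panoptic script above):
# ids = rgb_to_id(np.asarray(train_dataset[0]["panoptic_image"], dtype=np.uint32))
```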