Source code for domid.dsets.generate_dataset_dataframe_her2

import os
import sys

import numpy as np
import pandas as pd

# HER2 data directory: the first command-line argument when one is given,
# otherwise the default relative location.
path = sys.argv[1] if len(sys.argv) > 1 else "../../HER2/combined_train"
print(f"HER2 data directory: {path}")


def get_jpg_folders(path):
    """
    List the sub-folders of ``path`` that hold .jpg images.

    By convention the names of such folders end in ``jpg``; entries that do
    not end in ``jpg`` or that are not directories are ignored.

    :param path: directory to scan.
    :return: sorted list of matching folder names (names only, not full paths).
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the folder
    # order (and everything derived from it) reproducible across filesystems.
    return sorted(
        entry
        for entry in os.listdir(path)
        if entry.endswith("jpg") and os.path.isdir(os.path.join(path, entry))
    )
def total_count_images(path):
    """
    Count the entries contained in all jpg folders under ``path``.

    Every directory entry is counted (no filtering by file type). For each
    folder, the folder name, its entry count and the running total are printed.

    :param path: directory whose jpg folders are scanned.
    :return: total number of entries over all jpg folders.
    """
    running_total = 0
    for folder_name in get_jpg_folders(path):
        n_entries = len(os.listdir(os.path.join(path, folder_name)))
        running_total += n_entries
        print(folder_name, n_entries, running_total)
    return running_total
def parse_machine_labels(image_names):
    """
    Map image file names to integer machine labels.

    The two characters that precede the last four characters of each name
    (i.e. just before a 4-character extension such as ``.jpg``) are looked up
    in a fixed table: ``FD`` -> 0, ``H1`` and ``H2`` -> 1, ``ND`` -> 2.

    :param image_names: iterable of image file names.
    :return: list of integer labels, one per name, in input order.
    :raises KeyError: if a name carries a code that is not in the table.
    """
    code_to_label = {"FD": 0, "H1": 1, "H2": 1, "ND": 2}
    return [code_to_label[name[-6:-4]] for name in image_names]
def mean_scores_per_experiment(scores, img_locs):
    """
    Look up the mean score of every image in the scores table.

    The image names in the folders differ slightly from the names in the csv
    file, so each image is matched by a prefix: the file name with its last
    six characters removed (presumably the two-letter machine code plus
    ``.jpg``) must occur in the ``"file name"`` column of ``scores``.

    :param scores: DataFrame with a ``"file name"`` column and numeric score
        columns.
    :param img_locs: iterable of image file names or "/"-separated paths.
    :return: list of floats, one mean score per image, in input order.
    :raises TypeError: if an image matches zero rows or more than one row of
        ``scores`` (same exception type the original ``float(Series)``
        conversion produced).
    """
    file_names = scores["file name"]
    mean_scores = []
    for image_loc in img_locs:
        # Keep only the file name when a full "/"-separated path is given.
        image_name = str(image_loc).rsplit("/", 1)[-1]
        prefix = image_name[:-6]
        # Literal substring match: file names may contain regex metacharacters
        # such as "." or "+"; rows with a missing name never match.
        matches = file_names.str.contains(prefix, regex=False, na=False)
        # numeric_only skips the textual "file name" column, which would make
        # the row-wise mean fail on recent pandas versions.
        # NOTE(review): any other numeric column (e.g. an index-like "num")
        # still takes part in the mean, as in the original code - confirm.
        row_means = scores.loc[matches].mean(axis=1, numeric_only=True)
        if len(row_means) != 1:
            raise TypeError(
                f"expected exactly one row of scores matching {prefix!r} "
                f"(image {image_name!r}), found {len(row_means)}"
            )
        mean_scores.append(float(row_means.iloc[0]))
    return mean_scores
if __name__ == "__main__": folders = get_jpg_folders(path) N = total_count_images(path) number_labels = 3 data = np.zeros((N, number_labels + 1)).astype("str") start = 0 for folder in folders: print(folder) print("start", start) label_of_the_folder_int = int(folder[-4]) folder_path = os.path.join(path, folder) images = os.listdir(folder_path) labels = [label_of_the_folder_int] * len(images) machine_labels = parse_machine_labels(images) base_path_scores = os.path.join(*path.split("/")[:-1]) # base_path = "/your/data/location" scores = pd.read_csv( os.path.join(base_path_scores, "truthfile_002.csv"), names=["num", "file name", "s1", "s2", "s3", "s4", "s5", "s6", "s_7"], ) individual_scores = mean_scores_per_experiment(scores, images) data[start : start + len(images), :] = np.stack((images, labels, machine_labels, individual_scores), 0).T start += len(images) print(len(images)) dataframe = pd.DataFrame(data) csv_path = os.path.join(path, "dataframe.csv") print(csv_path) print(os.listdir(path)) dataframe.to_csv( csv_path, header=["img_id", "class", "machine", "score"], index=False, )