Source code for domid.dsets.generate_dataset_dataframe_her2
import os
import sys
import numpy as np
import pandas as pd
# Data directory: the first command-line argument when one is given,
# otherwise the default location relative to the working directory.
path = sys.argv[1] if len(sys.argv) > 1 else "../../HER2/combined_train"
print(f"HER2 data directory: {path}")
def get_jpg_folders(path):
    """
    Return the names of the sub-directories of ``path`` that hold .jpg images.

    By convention those folders have names ending in "jpg". Entries that are
    not directories, or whose name does not end in "jpg", are left out.
    Names are returned in the order ``os.listdir`` yields them.
    """
    return [
        entry
        for entry in os.listdir(path)
        if entry.endswith("jpg") and os.path.isdir(os.path.join(path, entry))
    ]
def total_count_images(path):
    """
    Count the directory entries across all jpg folders found under ``path``.

    For each folder, prints the folder name, its own entry count and the
    running total so far. Every entry returned by ``os.listdir`` is counted,
    whatever its type.

    :param path: directory containing the jpg folders
    :return: total number of entries over all jpg folders
    """
    running_total = 0
    for folder_name in get_jpg_folders(path):
        n_entries = len(os.listdir(os.path.join(path, folder_name)))
        running_total += n_entries
        print(folder_name, n_entries, running_total)
    return running_total
def parse_machine_labels(image_names):
    """
    Map each image name to an integer machine label.

    The two characters that precede the last four characters of the name
    (e.g. the "FD" in "..._FD.jpg") are looked up in a fixed table;
    "H1" and "H2" share the same label.

    :param image_names: iterable of image file names
    :return: list of integer labels, one per image name, in input order
    :raises KeyError: if a name carries a code that is not in the table
    """
    code_to_label = {"FD": 0, "H1": 1, "H2": 1, "ND": 2}
    return [code_to_label[name[-6:-4]] for name in image_names]
def mean_scores_per_experiment(scores, img_locs):
    """
    Parser to get the mean score per image from the csv file.

    The names of the images in the folders are slightly different from the
    names in the csv file, so each image is matched by a prefix of its file
    name: the name with its last six characters removed (the two-letter
    machine code plus a four-character extension such as ".jpg").

    :param scores: DataFrame with a "file name" column and numeric score columns
    :param img_locs: iterable of image file names or paths (only the final
        path component is used)
    :return: list of floats, one mean score per image, in input order
    :raises ValueError: if an image matches no row, or more than one row,
        of ``scores``
    """
    mean_scores = []
    for image_loc in img_locs:
        # Works whether or not the location carries leading directories.
        image_name = os.path.basename(image_loc)
        prefix = image_name[:-6]
        # regex=False: the prefix is a literal file-name fragment, so characters
        # such as "+", "(" or "." must not be interpreted as a pattern.
        # na=False: rows with a missing file name simply do not match.
        is_match = scores["file name"].str.contains(prefix, regex=False, na=False)
        # numeric_only=True keeps the string "file name" column out of the mean
        # (older pandas dropped it implicitly; pandas >= 2.0 raises instead).
        # NOTE(review): every numeric column of ``scores`` contributes to the
        # mean, including an index-like column if one is present - confirm
        # that this is intended.
        row_means = scores.loc[is_match].mean(axis=1, numeric_only=True)
        if len(row_means) != 1:
            raise ValueError(
                f"expected exactly one score row matching {prefix!r} "
                f"(image {image_name!r}), found {len(row_means)}"
            )
        mean_scores.append(float(row_means.iloc[0]))
    return mean_scores
if __name__ == "__main__":
    # Build <path>/dataframe.csv with one row per image:
    # image file name, class label, machine label and mean score.
    folders = get_jpg_folders(path)
    # Called for its printed per-folder counts.
    total_count_images(path)

    # The score file sits in the parent of the data directory. dirname/normpath
    # keeps absolute paths absolute and tolerates a trailing slash or a
    # single-component path.
    base_path_scores = os.path.dirname(os.path.normpath(path))
    # base_path = "/your/data/location"
    # Read once: the score table does not depend on the folder being processed.
    scores = pd.read_csv(
        os.path.join(base_path_scores, "truthfile_002.csv"),
        names=["num", "file name", "s1", "s2", "s3", "s4", "s5", "s6", "s_7"],
    )

    # Rows are collected as Python tuples rather than written into a
    # fixed-width numpy string array, which would silently truncate long
    # image names.
    rows = []
    for folder in folders:
        print(folder)
        print("start", len(rows))
        # The class label is the character right before the trailing "jpg"
        # in the folder name.
        label_of_the_folder_int = int(folder[-4])
        images = os.listdir(os.path.join(path, folder))
        machine_labels = parse_machine_labels(images)
        individual_scores = mean_scores_per_experiment(scores, images)
        rows.extend(
            (image, label_of_the_folder_int, machine, score)
            for image, machine, score in zip(images, machine_labels, individual_scores)
        )
        print(len(images))

    dataframe = pd.DataFrame(rows, columns=["img_id", "class", "machine", "score"])
    csv_path = os.path.join(path, "dataframe.csv")
    print(csv_path)
    print(os.listdir(path))
    # na_rep="nan" writes missing scores as "nan", as the string-array
    # version did, instead of as an empty field.
    dataframe.to_csv(csv_path, index=False, na_rep="nan")