Spaces:

owkin
/

substra

Runtime error

@@ -1,4 +1,11 @@
 import gradio as gr
 theme = gr.themes.Default(primary_hue="blue").set(
@@ -7,9 +14,49 @@ theme = gr.themes.Default(primary_hue="blue").set(
 )
 demo = gr.Blocks(theme=theme, css="""\
 .gradio-container {
-    width: 100%;
 }
 .margin-top {
@@ -26,19 +73,24 @@ demo = gr.Blocks(theme=theme, css="""\
 }
 .blue {
-    /**
     background-image: url("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/substra-banner.png");
     background-size: cover;
-    **/
-    background-color: #223fb3;
 }
 .blue p {
     color: white !important;
 }
 .info-box {
     background: transparent !important;
 }
 """)
@@ -49,7 +101,7 @@ with demo:
     gr.Markdown("# Federated  Learning with Substra")
     with gr.Row():
         with gr.Column(scale=1, elem_classes=["blue", "column"]):
-            gr.Markdown("Here you can run a quick simulation of Federated Learning with Substra.")
             gr.Markdown("Check out the accompanying blog post to learn more.")
             with gr.Box(elem_classes=["info-box"]):
                 gr.Markdown("""\
@@ -60,22 +112,23 @@ with demo:
         with gr.Column(scale=3, elem_classes=["white", "column"]):
             gr.Markdown("""\
             Data scientists doing medical research often face a shortage of high quality and diverse data to \
-            effectively train models. This challenge can be overcome by securely allowing training on pro- tected \
-            data through (Federated Learning). Substra is a Python based Federated Learning soft- ware that \
-            enables researchers to easily train ML models on remote data regardless of the ML library they are \
-            using or the data modality they are working with.\
             """)
-            gr.Markdown("### Here we show an example of image data located in two different hospitals.")
             gr.Markdown("""\
-            By playing with the distribution of data in the 2 simulated hospitals, you'll be able to compare how \
             the federated models compare with models trained on single datasets. The data used is from the \
-            Camelyon17 dataset, a commonly used benchmark in the medical world that comes from this challenge. \
-            The sample below shows normal cells on the left compared with cancer cells on the right.\
             """)
             gr.HTML("""
             <img
             src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/substra-tumor.png"
-            style="padding: 20px 150px;"
             />
             """)
             gr.Markdown("""\
@@ -87,8 +140,21 @@ with demo:
             """)
             with gr.Row(elem_classes=["margin-top"]):
-                gr.Slider()
-                gr.Slider()
-                gr.Button(value="Launch Experiment 🚀")
 demo.launch()

 import gradio as gr
+import uuid
+import asyncio
+from substra_launcher import launch_substra_space
+from huggingface_hub import HfApi
+hf_api = HfApi()
 theme = gr.themes.Default(primary_hue="blue").set(
 )
+async def launch_experiment(hospital_a, hospital_b):
+    experiment_id = str(uuid.uuid4())
+    asyncio.create_task(launch_substra_space(
+        hf_api=hf_api,
+        repo_id=experiment_id,
+        hospital_a=hospital_a,
+        hospital_b=hospital_b,
+    ))
+    url = f"https://hf.space/NimaBoscarino/{experiment_id}"
+    return (
+        gr.Button.update(interactive=False),
+        gr.Markdown.update(
+            visible=True,
+            value=f"Your experiment is available at [hf.space/NimaBoscarino/{experiment_id}]({url})!"
+        )
+    )
 demo = gr.Blocks(theme=theme, css="""\
+@font-face {
+    font-family: "Didact Gothic";
+    src: url('https://huggingface.co/datasets/NimaBoscarino/assets/resolve/main/substra/DidactGothic-Regular.ttf') format('truetype');
+}
+@font-face {
+    font-family: "Inter";
+    src: url('https://huggingface.co/datasets/NimaBoscarino/assets/resolve/main/substra/Inter-Regular.ttf') format('truetype');
+}
+h1 {
+    font-family: "Didact Gothic";
+    font-size: 40px !important;
+}
+p {
+    font-family: "Inter";
+}
 .gradio-container {
+    min-width: 100% !important;
 }
 .margin-top {
 }
 .blue {
     background-image: url("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/substra-banner.png");
     background-size: cover;
 }
 .blue p {
     color: white !important;
 }
+.blue strong {
+    color: white !important;
+}
 .info-box {
     background: transparent !important;
+    border-radius: 20px !important;
+    border-color: white !important;
+    border-width: 4px !important;
+    padding: 20px !important;
 }
 """)
     gr.Markdown("# Federated  Learning with Substra")
     with gr.Row():
         with gr.Column(scale=1, elem_classes=["blue", "column"]):
+            gr.Markdown("Here you can run a **quick simulation of Federated Learning**.")
             gr.Markdown("Check out the accompanying blog post to learn more.")
             with gr.Box(elem_classes=["info-box"]):
                 gr.Markdown("""\
         with gr.Column(scale=3, elem_classes=["white", "column"]):
             gr.Markdown("""\
             Data scientists doing medical research often face a shortage of high quality and diverse data to \
+            effectively train models. This challenge can be overcome by securely allowing training on protected \
+            data through Federated Learning. [Substra](https://docs.substra.org/) is a Python based Federated \
+            Learning software that enables researchers to easily train ML models on remote data regardless of the \
+            ML library they are using or the data type they are working with.
             """)
+            gr.Markdown("### Here we show an example of image data located in **two different hospitals**.")
             gr.Markdown("""\
+            By playing with the distribution of data in the two simulated hospitals, you'll be able to compare how \
             the federated models compare with models trained on single datasets. The data used is from the \
+            Camelyon17 dataset, a commonly used benchmark in the medical world that comes from \
+            [this challenge](https://camelyon17.grand-challenge.org/). The sample below shows normal cells on the \
+            left compared with cancer cells on the right.
             """)
             gr.HTML("""
             <img
             src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/substra-tumor.png"
+            style="height: 300px; margin: auto;"
             />
             """)
             gr.Markdown("""\
             """)
             with gr.Row(elem_classes=["margin-top"]):
+                hospital_a_slider = gr.Slider(
+                    label="Percentage of positive samples in Hospital A",
+                    value=50,
+                )
+                hospital_b_slider = gr.Slider(
+                    label="Percentage of positive samples in Hospital B",
+                    value=50,
+                )
+            launch_experiment_button = gr.Button(value="Launch Experiment 🚀")
+            visit_experiment_text = gr.Markdown(visible=False)
+    launch_experiment_button.click(
+        fn=launch_experiment,
+        inputs=[hospital_a_slider, hospital_b_slider],
+        outputs=[launch_experiment_button, visit_experiment_text]
+    )
 demo.launch()

fonts/DidactGothic-Regular.ttf ADDED Viewed

Binary file (181 kB). View file

fonts/Inter-Regular.ttf ADDED Viewed

Binary file (748 kB). View file

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-gradio==3.23.0
 pytest
 huggingface_hub

+gradio
 pytest
 huggingface_hub

substra_launcher.py CHANGED Viewed

@@ -1,7 +1,10 @@
 from huggingface_hub import HfApi, RepoUrl
-def launch_substra_space(hf_api: HfApi, num_hospitals: int, repo_id: str) -> RepoUrl:
     repo_id = "NimaBoscarino/" + repo_id
     repo_url = hf_api.create_repo(
@@ -13,12 +16,13 @@ def launch_substra_space(hf_api: HfApi, num_hospitals: int, repo_id: str) -> Rep
     hf_api.upload_folder(
         repo_id=repo_id,
         repo_type="space",
-        folder_path="substra_template/"
     )
     ENV_FILE = f"""\
-SUBSTRA_NUM_HOSPITALS={num_hospitals}
-    """
     hf_api.upload_file(
         repo_id=repo_id,

 from huggingface_hub import HfApi, RepoUrl
+async def launch_substra_space(
+        hf_api: HfApi, repo_id: str,
+        hospital_a: int, hospital_b: int,
+) -> RepoUrl:
     repo_id = "NimaBoscarino/" + repo_id
     repo_url = hf_api.create_repo(
     hf_api.upload_folder(
         repo_id=repo_id,
         repo_type="space",
+        folder_path="./substra_template/"
     )
     ENV_FILE = f"""\
+SUBSTRA_ORG1_DISTR={hospital_a / 100}
+SUBSTRA_ORG2_DISTR={hospital_b / 100}\
+"""
     hf_api.upload_file(
         repo_id=repo_id,

substra_template/Dockerfile CHANGED Viewed

@@ -1,31 +1,3 @@
-FROM python:3.10
-# Set the working directory to /code
-WORKDIR /code
-# Copy the current directory contents into the container at /code
-COPY ./requirements.txt /code/requirements.txt
-COPY ./mlflow-2.1.2.dev0-py3-none-any.whl /code/mlflow-2.1.2.dev0-py3-none-any.whl
-# Install requirements.txt
-RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
-RUN chmod -R 777 /usr/local/lib/python3.10/site-packages/
-# Set up a new user named "user" with user ID 1000
-RUN useradd -m -u 1000 user
-# Switch to the "user" user
-USER user
-# Set home to the user's home directory
-ENV HOME=/home/user \
-	PATH=/home/user/.local/bin:$PATH
-# Set the working directory to the user's home directory
-WORKDIR $HOME/app
-# Copy the current directory contents into the container at $HOME/app setting the owner to the user
-COPY --chown=user . $HOME/app
-RUN chmod -R 777 $HOME/app/
-EXPOSE 7860
-CMD ["bash", "run.sh"]


+FROM nimaboscarino/substra-trainer:latest
+
+CMD ["bash", "docker-run.sh"]

substra_template/README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: Substra Trainer
+emoji: 🚀
+colorFrom: red
+colorTo: gray
+sdk: docker
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

substra_template/__init__.py DELETED Viewed

File without changes

substra_template/mlflow-2.1.2.dev0-py3-none-any.whl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e1f15359f38fab62f43a7a3d51f56c86c882a4cb1c3dcabeda6daf5dc47f1613
-size 17638174

substra_template/mlflow_live_performances.py DELETED Viewed

@@ -1,45 +0,0 @@
-import pandas as pd
-import json
-from pathlib import Path
-from mlflow import log_metric
-import time
-import os
-from glob import glob
-# Watch a "performances.json" file under local-worker/live_performances/ and forward
-# every row not seen before to MLflow via log_metric. The script ends once the file
-# has gone unmodified for more than TIMEOUT seconds.
-TIMEOUT = 240  # Number of seconds to stop the script after the last update of the json file
-POLLING_FREQUENCY = 10  # Try to read the updates in the file every 10 seconds
-# Wait for the file to be found
-start = time.time()
-while not len(glob(str(Path("local-worker") / "live_performances" / "*" / "performances.json"))) > 0:
-    time.sleep(POLLING_FREQUENCY)
-    if time.time() - start >= TIMEOUT:
-        raise TimeoutError("The performance file does not exist, maybe no test task has been executed yet.")
-# Only the first glob match is followed, even if several directories match.
-path_to_json = Path(glob(str(Path("local-worker") / "live_performances" / "*" / "performances.json"))[0])
-# testtask_key values already sent to MLflow, so each row is logged only once.
-logged_rows = []
-# Starts as wall-clock time; after the first read it holds the file's mtime.
-last_update = time.time()
-while (time.time() - last_update) <= TIMEOUT:
-    # File unchanged since the last read: wait and poll again.
-    if last_update == os.path.getmtime(str(path_to_json)):
-        time.sleep(POLLING_FREQUENCY)
-        continue
-    last_update = os.path.getmtime(str(path_to_json))
-    time.sleep(1)  # Waiting for the json to be fully written
-    # NOTE(review): the handle returned by open() is never closed explicitly.
-    dict_perf = json.load(path_to_json.open())
-    df = pd.DataFrame(dict_perf)
-    for _, row in df.iterrows():
-        if row["testtask_key"] in logged_rows:
-            continue
-        logged_rows.append(row["testtask_key"])
-        # Use the round index as the MLflow step, falling back to the test task rank.
-        step = int(row["round_idx"]) if row["round_idx"] is not None else int(row["testtask_rank"])
-        # Metric name is "<metric_name>_<worker>".
-        log_metric(f"{row['metric_name']}_{row['worker']}", row["performance"], step)

substra_template/requirements.txt DELETED Viewed

@@ -1,13 +0,0 @@
-gradio
-substrafl
-datasets
-torch
-torchvision
-scikit-learn
-numpy==1.23.0
-Pillow
-transformers
-matplotlib
-pandas
-python-dotenv
-./mlflow-2.1.2.dev0-py3-none-any.whl

substra_template/run.sh DELETED Viewed

@@ -1,13 +0,0 @@
-# Start the compute plan and the metrics forwarder as background jobs.
-PYTHONPATH=$HOME/app python run_compute_plan.py &
-PYTHONPATH=$HOME/app python mlflow_live_performances.py &
-# Locate the first site-packages directory, where the mlflow JS bundles are patched below.
-SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
-# Fix for the UI code being embedded in an iframe
-# Replace window.parent.location.origin with *
-for i in $SITE_PACKAGES/mlflow/server/js/build/static/js/*.js; do
-  sed -i 's/window\.parent\.location\.origin)/"*")/' $i
-  # Also drop the window.top lookup so only window.location.href is used.
-  sed 's/window.top?.location.href || window.location.href/window.location.href/g' -i $i
-done
-# Serve the MLflow UI in the foreground on all interfaces, port 7860.
-mlflow ui --port 7860 --host 0.0.0.0

substra_template/run_compute_plan.py DELETED Viewed

@@ -1,40 +0,0 @@
-from substra_helpers.substra_runner import SubstraRunner, algo_generator
-from substra_helpers.model import CNN
-from substra_helpers.dataset import TorchDataset
-from substrafl.strategies import FedAvg
-import torch
-from dotenv import load_dotenv
-import os
-# Script entry point: configure a SubstraRunner from the environment and run a
-# FedAvg compute plan end to end.
-load_dotenv()
-# Read after load_dotenv() so a .env file can supply it; raises KeyError if the
-# variable is missing.
-NUM_CLIENTS = int(os.environ["SUBSTRA_NUM_HOSPITALS"])
-# The same seed is given to torch and, below, to the algorithm.
-seed = 42
-torch.manual_seed(seed)
-model = CNN()
-optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-criterion = torch.nn.CrossEntropyLoss()
-runner = SubstraRunner(num_clients=NUM_CLIENTS)
-# The runner steps below are called in a fixed order: clients, data, metric,
-# algorithm/strategy, nodes, then execution.
-runner.set_up_clients()
-runner.prepare_data()
-runner.register_data()
-runner.register_metric()
-# algo_generator returns a class; the trailing () instantiates it.
-runner.algorithm = algo_generator(
-    model=model,
-    criterion=criterion,
-    optimizer=optimizer,
-    index_generator=runner.index_generator,
-    dataset=TorchDataset,
-    seed=seed
-)()
-runner.strategy = FedAvg()
-runner.set_aggregation()
-runner.set_testing()
-runner.run_compute_plan()

substra_template/substra_helpers/__init__.py DELETED Viewed

File without changes

substra_template/substra_helpers/dataset.py DELETED Viewed

@@ -1,29 +0,0 @@
-import torch
-from torch.utils import data
-import torch.nn.functional as F
-import numpy as np
-class TorchDataset(data.Dataset):
-    """Dataset over a mapping with "image" and "label" entries.
-
-    Images are returned as float tensors scaled by 1/255 with one extra leading
-    axis; labels are returned one-hot encoded over 10 classes as float32.
-    """
-
-    def __init__(self, datasamples, is_inference: bool):
-        """Store the images, the labels and the inference flag.
-
-        Args:
-            datasamples: Mapping indexed by "image" and "label".
-            is_inference: When True, __getitem__ returns only the image tensor.
-        """
-        self.x = datasamples["image"]
-        self.y = datasamples["label"]
-        self.is_inference = is_inference
-    def __getitem__(self, idx):
-        """Return the image tensor at idx, or an (image, one-hot label) pair when training."""
-        if self.is_inference:
-            # [None, ...] prepends one axis; dividing by 255 rescales the pixel values.
-            x = torch.FloatTensor(np.array(self.x[idx])[None, ...]) / 255
-            return x
-        else:
-            x = torch.FloatTensor(np.array(self.x[idx])[None, ...]) / 255
-            y = torch.tensor(self.y[idx]).type(torch.int64)
-            # One-hot over a hard-coded 10 classes, cast to float32.
-            y = F.one_hot(y, 10)
-            y = y.type(torch.float32)
-            return x, y
-    def __len__(self):
-        """Return the number of images."""
-        return len(self.x)

substra_template/substra_helpers/dataset_assets/description.md DELETED Viewed

@@ -1,18 +0,0 @@
-# Mnist
-This dataset is [THE MNIST DATABASE of handwritten digits](http://yann.lecun.com/exdb/mnist/).
-The target is the number (0 -> 9) represented by the pixels.
-## Data repartition
-### Train and test
-### Split data between organizations
-## Opener usage
-The opener exposes 2 methods:
-- `get_data` returns a dictionary containing the images and the labels as numpy arrays
-- `fake_data` returns a fake data sample of images and labels in a dict

substra_template/substra_helpers/dataset_assets/opener.py DELETED Viewed

@@ -1,20 +0,0 @@
-import numpy as np
-import substratools as tools
-from datasets import load_from_disk
-from transformers import ImageFeatureExtractionMixin
-class MnistOpener(tools.Opener):
-    """Opener that loads a dataset saved on disk and can generate random stand-in data."""
-
-    def fake_data(self, n_samples=None):
-        """Return random images and labels in a dict with "image" and "label" keys.
-
-        Args:
-            n_samples: Requested sample count; any falsy value or a value above
-                100 is replaced by 100.
-        """
-        N_SAMPLES = n_samples if n_samples and n_samples <= 100 else 100
-        # Integers in [0, 256) with shape (N_SAMPLES, 28, 28).
-        fake_images = np.random.randint(256, size=(N_SAMPLES, 28, 28))
-        # Integer labels in [0, 10).
-        fake_labels = np.random.randint(10, size=N_SAMPLES)
-        data = {"image": fake_images, "label": fake_labels}
-        return data
-    def get_data(self, folders):
-        """Load the dataset stored in the first folder; any further folders are ignored."""
-        return load_from_disk(folders[0])

substra_template/substra_helpers/model.py DELETED Viewed

@@ -1,25 +0,0 @@
-from torch import nn
-import torch.nn.functional as F
-# TODO: Would be cool to use a simple Transformer model... then I could use the Trainer API 👀
-class CNN(nn.Module):
-    """Three 5x5 convolutions followed by two linear layers, producing 10 log-probabilities."""
-
-    def __init__(self):
-        """Create the convolution and linear layers."""
-        super(CNN, self).__init__()
-        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
-        self.conv2 = nn.Conv2d(32, 32, kernel_size=5)
-        self.conv3 = nn.Conv2d(32, 64, kernel_size=5)
-        # 3 * 3 * 64 must match the flattened conv3 output used in forward().
-        self.fc1 = nn.Linear(3 * 3 * 64, 256)
-        self.fc2 = nn.Linear(256, 10)
-    def forward(self, x, eval=False):
-        """Return log-softmax scores over dim 1 for a batch of single-channel images.
-
-        Args:
-            x: Input batch with one channel. The flatten below fits 28x28 inputs
-                (28 -> 24 -> 20 -> pool 10 -> 6 -> pool 3).
-            eval: When True, every dropout layer is disabled. The name shadows
-                the builtin eval inside this method.
-        """
-        x = F.relu(self.conv1(x))
-        x = F.relu(F.max_pool2d(self.conv2(x), 2))
-        x = F.dropout(x, p=0.5, training=not eval)
-        x = F.relu(F.max_pool2d(self.conv3(x), 2))
-        x = F.dropout(x, p=0.5, training=not eval)
-        # Flatten to (batch, 3 * 3 * 64) for the linear layers.
-        x = x.view(-1, 3 * 3 * 64)
-        x = F.relu(self.fc1(x))
-        x = F.dropout(x, p=0.5, training=not eval)
-        x = self.fc2(x)
-        return F.log_softmax(x, dim=1)

substra_template/substra_helpers/substra_runner.py DELETED Viewed

@@ -1,194 +0,0 @@
-import pathlib
-import shutil
-from typing import Optional, List
-from substra import Client, BackendType
-from substra.sdk.schemas import (
-    DatasetSpec,
-    Permissions,
-    DataSampleSpec
-)
-from substrafl.strategies import Strategy
-from substrafl.dependency import Dependency
-from substrafl.remote.register import add_metric
-from substrafl.index_generator import NpIndexGenerator
-from substrafl.algorithms.pytorch import TorchFedAvgAlgo
-from substrafl.nodes import TrainDataNode, AggregationNode, TestDataNode
-from substrafl.evaluation_strategy import EvaluationStrategy
-from substrafl.experiment import execute_experiment
-from substra.sdk.models import ComputePlan
-from datasets import load_dataset, Dataset
-from sklearn.metrics import accuracy_score
-import numpy as np
-import torch
-class SubstraRunner:
-    """Set up and run a federated MNIST experiment on local Substra clients.
-
-    The methods fill in state step by step: set_up_clients, prepare_data,
-    register_data, register_metric, set_aggregation, set_testing, then
-    run_compute_plan. The algorithm and strategy attributes must be assigned by
-    the caller before run_compute_plan.
-    """
-
-    def __init__(self, num_clients: int):
-        """Initialise empty state for the experiment.
-
-        Args:
-            num_clients: Total client count; one algo provider is created plus
-                num_clients - 1 data-holding clients.
-        """
-        self.num_clients = num_clients
-        # Data-holding clients keyed by organization id; filled by set_up_clients().
-        self.clients = {}
-        self.algo_provider: Optional[Client] = None
-        self.datasets: List[Dataset] = []
-        self.test_dataset: Optional[Dataset] = None
-        # Directory containing this module; data, assets and summaries live under it.
-        self.path = pathlib.Path(__file__).parent.resolve()
-        # The three dicts below are keyed by organization id.
-        self.dataset_keys = {}
-        self.train_data_sample_keys = {}
-        self.test_data_sample_keys = {}
-        self.metric_key: Optional[str] = None
-        NUM_UPDATES = 100
-        BATCH_SIZE = 32
-        self.index_generator = NpIndexGenerator(
-            batch_size=BATCH_SIZE,
-            num_updates=NUM_UPDATES,
-        )
-        self.algorithm: Optional[TorchFedAvgAlgo] = None
-        self.strategy: Optional[Strategy] = None
-        self.aggregation_node: Optional[AggregationNode] = None
-        self.train_data_nodes = list()
-        self.test_data_nodes = list()
-        self.eval_strategy: Optional[EvaluationStrategy] = None
-        self.NUM_ROUNDS = 3
-        self.compute_plan: Optional[ComputePlan] = None
-        self.experiment_folder = self.path / "experiment_summaries"
-    def set_up_clients(self):
-        """Create the algo provider and num_clients - 1 data clients, all on the LOCAL_SUBPROCESS backend."""
-        self.algo_provider = Client(backend_type=BackendType.LOCAL_SUBPROCESS)
-        self.clients = {
-            c.organization_info().organization_id: c
-            for c in [Client(backend_type=BackendType.LOCAL_SUBPROCESS) for _ in range(self.num_clients - 1)]
-        }
-    def prepare_data(self):
-        """Download MNIST, shard the training split per data client, and save everything to disk."""
-        dataset = load_dataset("mnist", split="train").shuffle()
-        # NOTE(review): num_clients == 1 gives num_shards=0 here; confirm callers pass >= 2.
-        self.datasets = [dataset.shard(num_shards=self.num_clients - 1, index=i) for i in range(self.num_clients - 1)]
-        self.test_dataset = load_dataset("mnist", split="test")
-        data_path = self.path / "data"
-        # Remove any data directory left from a previous run.
-        if data_path.exists() and data_path.is_dir():
-            shutil.rmtree(data_path)
-        for i, client_id in enumerate(self.clients):
-            ds = self.datasets[i]
-            ds.save_to_disk(data_path / client_id / "train")
-            # Every client gets its own copy of the full test split.
-            self.test_dataset.save_to_disk(data_path / client_id / "test")
-    def register_data(self):
-        """Register a dataset plus its train and test data samples with each data client."""
-        for client_id, client in self.clients.items():
-            # Non-public; only the algo provider's organization is authorized.
-            permissions_dataset = Permissions(public=False, authorized_ids=[
-                self.algo_provider.organization_info().organization_id
-            ])
-            dataset = DatasetSpec(
-                name="MNIST",
-                type="npy",
-                data_opener=self.path / pathlib.Path("dataset_assets/opener.py"),
-                description=self.path / pathlib.Path("dataset_assets/description.md"),
-                permissions=permissions_dataset,
-                logs_permission=permissions_dataset,
-            )
-            self.dataset_keys[client_id] = client.add_dataset(dataset)
-            assert self.dataset_keys[client_id], "Missing dataset key"
-            self.train_data_sample_keys[client_id] = client.add_data_sample(DataSampleSpec(
-                data_manager_keys=[self.dataset_keys[client_id]],
-                path=self.path / "data" / client_id / "train",
-            ))
-            data_sample = DataSampleSpec(
-                data_manager_keys=[self.dataset_keys[client_id]],
-                path=self.path / "data" / client_id / "test",
-            )
-            self.test_data_sample_keys[client_id] = client.add_data_sample(data_sample)
-    def register_metric(self):
-        """Register an accuracy metric through the algo provider and store its key in metric_key."""
-        # Non-public; authorized for the algo provider and every data client.
-        permissions_metric = Permissions(
-                                public=False,
-                                authorized_ids=[
-                                   self.algo_provider.organization_info().organization_id
-                                ] + list(self.clients.keys())
-                            )
-        metric_deps = Dependency(pypi_dependencies=["numpy==1.23.1", "scikit-learn==1.1.1"])
-        def accuracy(datasamples, predictions_path):
-            """Compare the argmax over axis 1 of the saved predictions with the true labels."""
-            y_true = datasamples["label"]
-            y_pred = np.load(predictions_path)
-            return accuracy_score(y_true, np.argmax(y_pred, axis=1))
-        self.metric_key = add_metric(
-            client=self.algo_provider,
-            metric_function=accuracy,
-            permissions=permissions_metric,
-            dependencies=metric_deps,
-        )
-    def set_aggregation(self):
-        """Create the aggregation node on the algo provider and one train data node per data client."""
-        self.aggregation_node = AggregationNode(self.algo_provider.organization_info().organization_id)
-        for org_id in self.clients:
-            train_data_node = TrainDataNode(
-                organization_id=org_id,
-                data_manager_key=self.dataset_keys[org_id],
-                data_sample_keys=[self.train_data_sample_keys[org_id]],
-            )
-            self.train_data_nodes.append(train_data_node)
-    def set_testing(self):
-        """Create one test data node per data client and build the evaluation strategy."""
-        for org_id in self.clients:
-            test_data_node = TestDataNode(
-                organization_id=org_id,
-                data_manager_key=self.dataset_keys[org_id],
-                test_data_sample_keys=[self.test_data_sample_keys[org_id]],
-                metric_keys=[self.metric_key],
-            )
-            self.test_data_nodes.append(test_data_node)
-        # NOTE(review): rounds=1 presumably means "evaluate every round" - confirm in substrafl docs.
-        self.eval_strategy = EvaluationStrategy(test_data_nodes=self.test_data_nodes, rounds=1)
-    def run_compute_plan(self):
-        """Run the experiment for NUM_ROUNDS rounds and store the result in compute_plan."""
-        algo_deps = Dependency(pypi_dependencies=["numpy==1.23.1", "torch==1.11.0"])
-        self.compute_plan = execute_experiment(
-            client=self.algo_provider,
-            algo=self.algorithm,
-            strategy=self.strategy,
-            train_data_nodes=self.train_data_nodes,
-            evaluation_strategy=self.eval_strategy,
-            aggregation_node=self.aggregation_node,
-            num_rounds=self.NUM_ROUNDS,
-            experiment_folder=self.experiment_folder,
-            dependencies=algo_deps,
-        )
-def algo_generator(model, criterion, optimizer, index_generator, dataset, seed):
-    """Build a TorchFedAvgAlgo subclass with a no-argument constructor.
-
-    The returned class (not an instance) captures the given arguments in a
-    closure and passes them to TorchFedAvgAlgo.__init__ when instantiated.
-    """
-    class MyAlgo(TorchFedAvgAlgo):
-        def __init__(self):
-            super().__init__(
-                model=model,
-                criterion=criterion,
-                optimizer=optimizer,
-                index_generator=index_generator,
-                dataset=dataset,
-                seed=seed,
-            )
-    return MyAlgo