muellerzr HF staff committed on
Commit
e44403a
1 Parent(s): be6343c

Big refactor

Browse files
Files changed (8) hide show
  1. Makefile +11 -0
  2. README.md +1 -1
  3. app.py +0 -187
  4. pyproject.toml +16 -0
  5. src/__init__.py +0 -0
  6. src/app.py +74 -0
  7. src/hub_utils.py +62 -0
  8. src/model_utils.py +85 -0
Makefile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ check_dirs := src
2
+
3
+ # this target runs checks on all files
4
+ quality:
5
+ black --required-version 23 --check $(check_dirs)
6
+ ruff $(check_dirs)
7
+
8
+ # Format source code automatically and check if there are any problems left that need manual fixing
9
+ style:
10
+ black --required-version 23 $(check_dirs)
11
+ ruff $(check_dirs) --fix
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.40.1
8
- app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
 
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.40.1
8
+ app_file: src/app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
app.py DELETED
@@ -1,187 +0,0 @@
1
- import os
2
- import re
3
- import webbrowser
4
- import pandas as pd
5
- import gradio as gr
6
- from huggingface_hub import HfApi
7
- from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
8
- from accelerate.commands.estimate import create_empty_model, check_has_model
9
- from accelerate.utils import convert_bytes, calculate_maximum_sizes
10
- from urllib.parse import urlparse
11
-
12
- # We need to store them as globals because gradio doesn't have a way for us to pass them in to the button
13
- HAS_DISCUSSION = True
14
- MODEL_NAME = None
15
- LIBRARY = None
16
- USER_TOKEN = None
17
- TOKEN = os.environ.get("HUGGINGFACE_API_LOGIN", None)
18
-
19
- def translate_llama2(text):
20
- "Translates llama-2 to its hf counterpart"
21
- if not text.endswith("-hf"):
22
- return text + "-hf"
23
- return text
24
-
25
- def check_for_discussion(model_name:str):
26
- "Checks if an automated discussion has been opened on the model by `model-sizer-bot`"
27
- global TOKEN
28
- api = HfApi(token=TOKEN)
29
- discussions = list(api.get_repo_discussions(model_name))
30
- return any(discussion.title == "[AUTOMATED] Model Memory Requirements" and discussion.author == "model-sizer-bot" for discussion in discussions)
31
-
32
- def report_results():
33
- "Reports the results of a memory calculation to the model's discussion page, and opens a new tab to it afterwards"
34
- global MODEL_NAME, LIBRARY, TOKEN, USER_TOKEN
35
- api = HfApi(token=TOKEN)
36
- results, data = calculate_memory(MODEL_NAME, LIBRARY, ["fp32", "fp16", "int8", "int4"], access_token=USER_TOKEN, raw=True)
37
- minimum = data[0]
38
-
39
- USER_TOKEN = None
40
- post = f"""# Model Memory Requirements\n
41
-
42
- You will need about {minimum[1]} VRAM to load this model for inference, and {minimum[3]} VRAM to train it using Adam.
43
-
44
- These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
45
-
46
- The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
47
- When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
48
-
49
- When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
50
-
51
- ## Results:
52
-
53
- {results}
54
- """
55
- discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
56
- webbrowser.open_new_tab(discussion.url)
57
-
58
- def extract_from_url(name:str):
59
- "Checks if `name` is a URL, and if so converts it to a model name"
60
- is_url = False
61
- try:
62
- result = urlparse(name)
63
- is_url = all([result.scheme, result.netloc])
64
- except:
65
- is_url = False
66
- # Pass through if not a URL
67
- if not is_url:
68
- return name
69
- else:
70
- path = result.path
71
- return path[1:]
72
-
73
- def calculate_memory(model_name:str, library:str, options:list, access_token:str, raw=False):
74
- "Calculates the memory usage for a model"
75
- if "meta-llama" in model_name:
76
- model_name = translate_llama2(model_name)
77
- if library == "auto":
78
- library = None
79
- model_name = extract_from_url(model_name)
80
- try:
81
- model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
82
- except GatedRepoError:
83
- raise gr.Error(f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. ")
84
- except RepositoryNotFoundError:
85
- raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
86
- except ValueError as e:
87
- raise gr.Error(f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)")
88
- except (RuntimeError, OSError) as e:
89
- library = check_has_model(e)
90
- if library != "unknown":
91
- raise gr.Error(f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo.")
92
- raise gr.Error(f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`")
93
- except ImportError:
94
- # hacky way to check if it works with `trust_remote_code=False`
95
- model = create_empty_model(model_name, library_name=library, trust_remote_code=False, access_token=access_token)
96
- except Exception as e:
97
- raise gr.Error(f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`")
98
- total_size, largest_layer = calculate_maximum_sizes(model)
99
-
100
- data = []
101
-
102
- title = f"Memory Usage for '{model_name}'"
103
- for dtype in options:
104
- dtype_total_size = total_size
105
- dtype_largest_layer = largest_layer[0]
106
- if dtype in ("fp16", "bf16", "float16/bfloat16"):
107
- dtype_total_size /= 2
108
- dtype_largest_layer /= 2
109
- elif dtype == "int8":
110
- dtype_total_size /= 4
111
- dtype_largest_layer /= 4
112
- elif dtype == "int4":
113
- dtype_total_size /= 8
114
- dtype_largest_layer /= 8
115
- dtype_training_size = convert_bytes(dtype_total_size * 4)
116
- dtype_total_size = convert_bytes(dtype_total_size)
117
- dtype_largest_layer = convert_bytes(dtype_largest_layer)
118
- data.append({
119
- "dtype": dtype,
120
- "Largest Layer or Residual Group": dtype_largest_layer,
121
- "Total Size": dtype_total_size,
122
- "Training using Adam": dtype_training_size
123
- })
124
- global HAS_DISCUSSION, MODEL_NAME, LIBRARY
125
- HAS_DISCUSSION = check_for_discussion(model_name)
126
- MODEL_NAME = model_name
127
- LIBRARY = library
128
-
129
- if raw:
130
- return pd.DataFrame(data).to_markdown(index=False), data
131
-
132
- results = [
133
- f'## {title}',
134
- gr.update(visible=True, value=pd.DataFrame(data)),
135
- gr.update(visible=not HAS_DISCUSSION)
136
- ]
137
- return results
138
-
139
- with gr.Blocks() as demo:
140
- with gr.Column():
141
- gr.Markdown(
142
- """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
143
-
144
- This tool will help you calculate how much vRAM is needed to train and perform big model inference
145
- on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
146
- is denoted as the size of the "largest layer", and training of a model is roughly 4x its size (for Adam).
147
-
148
- These calculations are accurate within a few percent at most, such as `bert-base-cased` being 413.68 MB and the calculator estimating 413.18 MB.
149
-
150
- When performing inference, expect to add up to an additional 20% to this as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/).
151
- More tests will be performed in the future to get a more accurate benchmark for each model.
152
-
153
- Currently this tool supports all models hosted that use `transformers` and `timm`.
154
-
155
- To use this tool pass in the URL or model name of the model you want to calculate the memory usage for,
156
- select which framework it originates from ("auto" will try and detect it from the model metadata), and
157
- what precisions you want to use."""
158
- )
159
- out_text = gr.Markdown()
160
- out = gr.DataFrame(
161
- headers=["dtype", "Largest Layer", "Total Size", "Training using Adam"],
162
- interactive=False,
163
- visible=False,
164
- )
165
- with gr.Row():
166
- inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
167
- with gr.Row():
168
- library = gr.Radio(["auto", "transformers", "timm"], label="Library", value="auto")
169
- options = gr.CheckboxGroup(
170
- ["float32", "float16/bfloat16", "int8", "int4"],
171
- value="float32",
172
- label="Model Precision",
173
- )
174
- access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
175
- with gr.Row():
176
- btn = gr.Button("Calculate Memory Usage")
177
- post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
178
- USER_TOKEN = access_token
179
-
180
- btn.click(
181
- calculate_memory, inputs=[inp, library, options, access_token], outputs=[out_text, out, post_to_hub],
182
- )
183
-
184
- post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
185
-
186
-
187
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.black]
2
+ line-length = 119
3
+ target-version = ['py37']
4
+
5
+ [tool.ruff]
6
+ # Never enforce `E501` (line length violations).
7
+ ignore = ["E501", "E741", "W605"]
8
+ select = ["E", "F", "I", "W"]
9
+ line-length = 119
10
+
11
+ # Ignore import violations in all `__init__.py` files.
12
+ [tool.ruff.per-file-ignores]
13
+ "__init__.py" = ["E402", "F401", "F403", "F811"]
14
+
15
+ [tool.ruff.isort]
16
+ lines-after-imports = 2
src/__init__.py ADDED
File without changes
src/app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ from .hub_utils import check_for_discussion, report_results
5
+ from .model_utils import calculate_memory, get_model
6
+
7
+
8
+ # We need to store the model as a global because gradio doesn't have a way for us to pass it in to the button
9
+ MODEL = None
10
+
11
+
12
def get_results(model_name: str, library: str, options: list, access_token: str):
    """Load `model_name`, size it for each requested precision and build the Gradio outputs.

    Args:
        model_name: Model id or full Hub URL, as typed by the user.
        library: Library to load the model with, forwarded to `get_model`.
        options: Precision labels to report, forwarded to `calculate_memory`.
        access_token: Optional Hub token, forwarded to `get_model`.

    Returns:
        A three item list: the markdown title, an update that shows the results table, and an
        update that shows the "report" button only when no automated discussion exists yet.
    """
    # Function-scope import: this module only imports `check_for_discussion` and `report_results`
    # from `hub_utils` at the top level.
    from .hub_utils import extract_from_url

    global MODEL
    MODEL = get_model(model_name, library, access_token)

    # `get_model` resolves URLs internally, but the discussion lookup needs a repo id as well;
    # handing it a full URL makes the Hub call fail after the model was already loaded.
    # NOTE(review): `get_model` may also append "-hf" to meta-llama names; that is not mirrored here.
    repo_id = extract_from_url(model_name)
    has_discussion = check_for_discussion(repo_id)

    title = f"## Memory usage for '{model_name}'"
    data = calculate_memory(MODEL, options)
    return [title, gr.update(visible=True, value=pd.DataFrame(data)), gr.update(visible=not has_discussion)]
19
+
20
+
21
+ with gr.Blocks() as demo:
22
+ with gr.Column():
23
+ gr.Markdown(
24
+ """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
25
+
26
+ This tool will help you calculate how much vRAM is needed to train and perform big model inference
27
+ on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
28
+ is denoted as the size of the "largest layer", and training of a model is roughly 4x its size (for Adam).
29
+
30
+ These calculations are accurate within a few percent at most, such as `bert-base-cased` being 413.68 MB and the calculator estimating 413.18 MB.
31
+
32
+ When performing inference, expect to add up to an additional 20% to this as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/).
33
+ More tests will be performed in the future to get a more accurate benchmark for each model.
34
+
35
+ Currently this tool supports all models hosted that use `transformers` and `timm`.
36
+
37
+ To use this tool pass in the URL or model name of the model you want to calculate the memory usage for,
38
+ select which framework it originates from ("auto" will try and detect it from the model metadata), and
39
+ what precisions you want to use."""
40
+ )
41
+ out_text = gr.Markdown()
42
+ out = gr.DataFrame(
43
+ headers=["dtype", "Largest Layer", "Total Size", "Training using Adam"],
44
+ interactive=False,
45
+ visible=False,
46
+ )
47
+ with gr.Row():
48
+ inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
49
+ with gr.Row():
50
+ library = gr.Radio(["auto", "transformers", "timm"], label="Library", value="auto")
51
+ options = gr.CheckboxGroup(
52
+ ["float32", "float16/bfloat16", "int8", "int4"],
53
+ value="float32",
54
+ label="Model Precision",
55
+ )
56
+ access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
57
+ with gr.Row():
58
+ btn = gr.Button("Calculate Memory Usage")
59
+ post_to_hub = gr.Button(
60
+ value="Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False
61
+ )
62
+
63
+ btn.click(
64
+ get_results,
65
+ inputs=[inp, library, options, access_token],
66
+ outputs=[out_text, out, post_to_hub],
67
+ )
68
+
69
+ post_to_hub.click(report_results, inputs=[inp, library, access_token]).then(
70
+ lambda: gr.Button.update(visible=False), outputs=post_to_hub
71
+ )
72
+
73
+
74
+ demo.launch()
src/hub_utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utilities related to searching and posting on the Hub
2
+ import os
3
+ import webbrowser
4
+ from urllib.parse import urlparse
5
+
6
+ import pandas as pd
7
+ from huggingface_hub import HfApi
8
+
9
+ from .model_utils import calculate_memory, get_model
10
+
11
+
12
def extract_from_url(name: str):
    """Return the model name contained in `name`, which may be a plain name or a full URL.

    Args:
        name: Either a model name (returned unchanged) or a URL whose path is the model name.

    Returns:
        `name` itself when it is not a URL (no scheme or no host), otherwise the URL path with
        its leading and trailing slashes removed. Query strings and fragments are dropped.
    """
    try:
        parsed = urlparse(name)
    except ValueError:
        # `urlparse` raises on some malformed inputs (e.g. unbalanced IPv6 brackets);
        # treat those as plain names and let the Hub lookup report the problem.
        return name

    # Only a string with both a scheme and a host counts as a URL
    if not (parsed.scheme and parsed.netloc):
        return name

    # Strip both ends so ".../org/model/" yields "org/model" rather than "org/model/"
    return parsed.path.strip("/")
26
+
27
+
28
def check_for_discussion(model_name: str):
    """Check whether `model-sizer-bot` already opened the automated discussion on `model_name`.

    Args:
        model_name: Repo id of the model whose discussions are inspected.

    Returns:
        `True` if a discussion titled "[AUTOMATED] Model Memory Requirements" authored by
        `model-sizer-bot` exists on the repo, `False` otherwise.
    """
    api = HfApi(token=os.environ.get("HUGGINGFACE_API_LOGIN", None))
    # Feed the discussions straight into `any` so the scan stops at the first match
    # instead of collecting every discussion into a list first.
    return any(
        discussion.title == "[AUTOMATED] Model Memory Requirements" and discussion.author == "model-sizer-bot"
        for discussion in api.get_repo_discussions(model_name)
    )
36
+
37
+
38
def report_results(model_name, library, access_token):
    """Post the memory estimate for a model to its discussion page and open the new discussion.

    Args:
        model_name: Model id or full Hub URL.
        library: Library to load the model with, forwarded to `get_model`.
        access_token: Optional Hub token, forwarded to `get_model`.
    """
    model = get_model(model_name, library, access_token)
    # Use the precision labels the UI uses; the short forms ("fp32", "fp16") are not keys of the
    # dtype table in `model_utils` and made `calculate_memory` fail with a KeyError.
    data = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
    # Rows are dicts keyed by column name, so read the headline numbers by key (positional
    # indexing raised KeyError). The first row is the first precision requested above.
    minimum = data[0]
    inference_size = minimum["Largest Layer or Residual Group"]
    training_size = minimum["Training using Adam"]
    data = pd.DataFrame(data).to_markdown(index=False)

    post = f"""# Model Memory Requirements\n

You will need about {inference_size} VRAM to load this model for inference, and {training_size} VRAM to train it using Adam.

These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.

The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.

When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).

## Results:

{data}
"""
    api = HfApi(token=os.environ.get("HUGGINGFACE_API_LOGIN", None))
    # The user may have typed a full URL; the Hub API needs the repo id.
    # NOTE(review): `get_model` may also append "-hf" to meta-llama names; that is not mirrored here.
    repo_id = extract_from_url(model_name)
    discussion = api.create_discussion(repo_id, "[AUTOMATED] Model Memory Requirements", description=post)
    # NOTE(review): this opens a browser on the machine running this process, which is presumably
    # the server rather than the visitor's machine when hosted -- confirm this is intended.
    webbrowser.open_new_tab(discussion.url)
src/model_utils.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utilities related to loading in and working with models/specific models
2
+ import gradio as gr
3
+ import torch
4
+ from accelerate.commands.estimate import check_has_model, create_empty_model
5
+ from accelerate.utils import calculate_maximum_sizes, convert_bytes
6
+ from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
7
+
8
+ from .hub_utils import extract_from_url
9
+
10
+
11
+ DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
12
+
13
+
14
def translate_llama2(text):
    """Return `text` ending in "-hf", appending the suffix only when it is missing."""
    suffix = "-hf"
    if text.endswith(suffix):
        return text
    return text + suffix
19
+
20
+
21
def get_model(model_name: str, library: str, access_token: str):
    """Find a model on the Hub and build an empty copy of it via `create_empty_model`.

    Args:
        model_name: Model id or full Hub URL.
        library: Library name to load with; "auto" lets `create_empty_model` decide.
        access_token: Hub token for gated models; an empty value means "no token".

    Returns:
        The model returned by `create_empty_model`.

    Raises:
        gr.Error: With a user-facing message when the repo is gated, missing, lacks library
            metadata, or fails to load.
    """
    # Resolve URLs first so the llama suffix below is appended to the repo id itself and cannot
    # end up behind a query string or fragment that URL extraction would then throw away.
    model_name = extract_from_url(model_name)
    if "meta-llama" in model_name:
        model_name = translate_llama2(model_name)
    if library == "auto":
        library = None
    # Presumably an untouched token textbox arrives as ""; forward `None` so no token is sent
    # instead of an empty one.
    if not access_token:
        access_token = None
    try:
        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
    except GatedRepoError:
        raise gr.Error(
            f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. "
        )
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError:
        raise gr.Error(
            f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)"
        )
    except (RuntimeError, OSError) as e:
        # `check_has_model` inspects the error; anything other than "unknown" is reported as the
        # library that was tried.
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(
                f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo."
            )
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    except ImportError:
        # hacky way to check if it works with `trust_remote_code=False`
        model = create_empty_model(
            model_name, library_name=library, trust_remote_code=False, access_token=access_token
        )
    except Exception as e:
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    return model
59
+
60
+
61
def calculate_memory(model: torch.nn.Module, options: list):
    """Calculate the memory usage of `model` for each requested precision.

    Args:
        model: Model to measure with `calculate_maximum_sizes`.
        options: Precision labels. Keys of `DTYPE_MODIFIER` are accepted, plus the short
            aliases "fp32", "fp16" and "bf16".

    Returns:
        A list with one dict per entry of `options`, holding the label as passed in and the
        human readable largest-layer size, total size and Adam training size (4x the total).

    Raises:
        KeyError: If a label is neither a key of `DTYPE_MODIFIER` nor a known alias.
    """
    # Short names the pre-refactor implementation accepted ("fp32" meant "no scaling" there);
    # map them onto the table's keys instead of failing.
    aliases = {"fp32": "float32", "fp16": "float16/bfloat16", "bf16": "float16/bfloat16"}
    total_size, largest_layer = calculate_maximum_sizes(model)

    data = []
    for dtype in options:
        # The unscaled sizes are divided by the per-precision modifier from the table
        modifier = DTYPE_MODIFIER[aliases.get(dtype, dtype)]
        dtype_total_size = total_size / modifier
        dtype_largest_layer = largest_layer[0] / modifier

        data.append(
            {
                "dtype": dtype,
                "Largest Layer or Residual Group": convert_bytes(dtype_largest_layer),
                "Total Size": convert_bytes(dtype_total_size),
                "Training using Adam": convert_bytes(dtype_total_size * 4),
            }
        )
    return data