meg-huggingface committed
Commit 7dd405e
Parent(s): 5169c06

Inference endpoints and parallelism.

Files changed:
- app.py +1 -6
- main_backend_toxicity.py +2 -5
- src/backend/inference_endpoint.py +15 -39
- src/backend/run_toxicity_eval.py +19 -19
- src/envs.py +2 -4
- src/logging.py +0 -1
app.py
CHANGED
@@ -1,9 +1,5 @@
 from apscheduler.schedulers.background import BackgroundScheduler
-import logging
 from src.logging import configure_root_logger
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("numexpr").setLevel(logging.WARNING)
-logging.getLogger("absl").setLevel(logging.WARNING)
 configure_root_logger()
 
 from functools import partial
@@ -15,7 +11,6 @@ from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
 from src.logging import setup_logger, log_file
 
-logging.basicConfig(level=logging.INFO)
 logger = setup_logger(__name__)
 
 intro_md = f"""
@@ -37,7 +32,7 @@ def auto_eval():
     logger.info("Triggering Auto Eval")
     main_backend_toxicity.run_auto_eval()
 
-reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=
+reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
 
 with gr.Blocks(js=dark_mode_gradio_js) as backend_ui:
     gr.Markdown(intro_md)
main_backend_toxicity.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 import pprint
 import re
 from huggingface_hub import snapshot_download, delete_inference_endpoint
@@ -13,10 +12,8 @@ from src.envs import (QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO,
 #, LIMIT, ACCELERATOR, VENDOR, REGION
 from src.logging import setup_logger
 
-logging.getLogger("openai").setLevel(logging.DEBUG)
 logger = setup_logger(__name__)
 
-# logging.basicConfig(level=logging.ERROR)
 pp = pprint.PrettyPrinter(width=80)
 
 PENDING_STATUS = "PENDING"
@@ -72,8 +69,8 @@ def run_auto_eval():
     endpoint_url = create_endpoint(endpoint_name, model_repository)
     logger.info("Created an endpoint url at %s" % endpoint_url)
     results = main(endpoint_url, eval_request)
-    logger.
-    logger.
+    logger.info("FINISHED!")
+    logger.info(results)
     logger.info(f'Completed Evaluation of {eval_request.json_filepath}')
     set_eval_request(api=API,
                      eval_request=eval_request,
src/backend/inference_endpoint.py
CHANGED
@@ -1,13 +1,10 @@
 import sys
-import huggingface_hub.utils._errors
 from time import sleep
-import logging
 from huggingface_hub import create_inference_endpoint, get_inference_endpoint
 from src.backend.run_toxicity_eval import get_generation
 from src.logging import setup_logger
 import requests
 
-logging.basicConfig(level=logging.DEBUG)
 logger = setup_logger(__name__)
 TIMEOUT = 20
 MAX_REPLICA = 1
@@ -18,8 +15,13 @@ def create_endpoint(endpoint_name, repository, framework='pytorch',
                     region='us-east-1', type='protected', instance_size='x4',
                     instance_type='nvidia-l4'):
     logger.info("Creating endpoint %s..." % endpoint_name)
-    #
+    # Useful in debugging: Is it already there?
     try:
+        endpoint = get_inference_endpoint(endpoint_name)
+        have_endpoint = True
+    except requests.exceptions.HTTPError:
+        have_endpoint = False
+    if not have_endpoint:
         endpoint = create_inference_endpoint(endpoint_name,
                                              repository=repository,
                                              framework=framework, task=task,
@@ -29,65 +31,39 @@ def create_endpoint(endpoint_name, repository, framework='pytorch',
                                              instance_size=instance_size,
                                              instance_type=instance_type,
                                              max_replica=MAX_REPLICA)
-    except huggingface_hub.utils._errors.HfHubHTTPError as e:
-        # Workload with the same name already exists error.
-        # Use it again, just make sure it has the right settings.
-        # TODO(mm): Is this error even catching?
-        logger.debug("Hit error:")
-        logger.debug(e)
-        logger.debug("Attempting to update with the given parameters.")
-        endpoint = get_inference_endpoint(endpoint_name)
-        endpoint.update(repository=repository,
-                        framework=framework, task=task,
-                        accelerator=accelerator,
-                        instance_size=instance_size,
-                        instance_type=instance_type,
-                        max_replica=MAX_REPLICA)
-    except requests.exceptions.HTTPError as e:
-        # Not enough compute, wrong compute, or quota exceeded
-        logger.debug("Hit error:")
-        logger.debug(e)
-        logger.debug("Attempting a different compute.")
-        endpoint = update_endpoint_exception(endpoint)
-    except Exception as e:
-        logger.debug("Hit unaccounted-for error")
-        logger.debug(e)
-        sys.exit()
-    endpoint.fetch()
     logger.info("Endpoint status: %s." % endpoint.status)
     if endpoint.status == 'scaledToZero':
         # Send a request to wake it up.
         get_generation(endpoint.url, "Wake up")
         sleep(TIMEOUT)
-
-    logger.info("Endpoint failed, attempting to change compute.")
-    endpoint = update_endpoint_exception(endpoint)
+    # Applies in ['updating', 'pending', 'initializing']
     wait_for_endpoint(endpoint)
     if endpoint.status == 'failed':
         logger.info("Endpoint failed, attempting to change compute.")
         endpoint = update_endpoint_exception(endpoint)
+        # Applies in ['updating', 'pending', 'initializing']
         wait_for_endpoint(endpoint)
     logger.info("Endpoint created:")
     logger.info(endpoint)
     generation_url = endpoint.url
     if generation_url is None:
-        logger.
+        logger.error("Failed to create an endpoint. Exiting.")
         sys.exit()
     return generation_url
 
 
 def wait_for_endpoint(endpoint):
+    # TODO: HANDLE 'paused'
     i = 0
-    while endpoint.status in ['pending',
-                              'initializing']: # not in ['failed', 'running', 'scaledToZero']
+    while endpoint.status in ['updating', 'pending', 'initializing']: # not in ['failed', 'running', 'scaledToZero']
         if i >= 20:
-            logger.
+            logger.error("Model failed to respond. Exiting.")
             sys.exit()
-        logger.
+        logger.info(
             "Waiting %d seconds to check again if the endpoint is running." % TIMEOUT)
         sleep(TIMEOUT)
         endpoint.fetch()
-        logger.
+        logger.info("Endpoint status: %s." % (endpoint.status))
         i += 1
 
 
@@ -102,7 +78,7 @@ def update_endpoint_exception(endpoint):
         endpoint.update(instance_size='x4', instance_type='nvidia-a10g',
                         max_replica=MAX_REPLICA)
     else:
-        logger.
+        logger.error(
             "Getting expensive to try to run this model without human oversight. Exiting.")
         sys.exit()
     return endpoint
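For context: the create_endpoint() rewrite above drops the old HfHubHTTPError fallback and instead checks up front whether an endpoint with that name already exists, only creating one when it does not. Below is a minimal sketch of that reuse-or-create flow using the same huggingface_hub calls as the diff. The task, accelerator, and vendor values are illustrative assumptions, not defaults confirmed by this commit, and get_or_create_endpoint is a hypothetical helper name.

import requests
from huggingface_hub import create_inference_endpoint, get_inference_endpoint

def get_or_create_endpoint(endpoint_name, repository):
    # Reuse an endpoint with this name if it already exists;
    # get_inference_endpoint raises an HTTPError when it does not.
    try:
        return get_inference_endpoint(endpoint_name)
    except requests.exceptions.HTTPError:
        pass
    # Otherwise create one. task/accelerator/vendor below are assumed values
    # for illustration; the repo's create_endpoint() passes its own arguments.
    return create_inference_endpoint(
        endpoint_name,
        repository=repository,
        framework="pytorch",
        task="text-generation",   # assumed task
        accelerator="gpu",        # assumed accelerator
        vendor="aws",             # assumed vendor
        region="us-east-1",
        type="protected",
        instance_size="x4",
        instance_type="nvidia-l4",
        max_replica=1,
    )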
src/backend/run_toxicity_eval.py
CHANGED
@@ -13,8 +13,8 @@ from statistics import mean
 #TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 #PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
 # TODO: I'm sending these constants through from main_backend_toxicity as well, which calls
-#
-from src.envs import TOKEN, PERSPECTIVE_API_KEY, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API
+# this file. Choose one or the other?
+from src.envs import TOKEN, PERSPECTIVE_API_KEY, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, EVAL_CUTOFF
 # QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, CACHE_PATH,
 from src.logging import setup_logger
 logger = setup_logger(__name__)
@@ -26,7 +26,6 @@ REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY":{},
                         "THREAT":{}}
 ATT_SCORE = "attributeScores"
 SUM_SCORE = "summaryScore"
-DATASET_CUTOFF = 1000
 
 def split_text(generated_text):
     split_text = generated_text.split('\n')
@@ -58,6 +57,7 @@ def get_generation(endpoint_url, pred):
     })
     if output is None:
         return ""
+    # Handling for when it's fallen asleep.
     retries = 0
     while output == {'error': '503 Service Unavailable'}:
         logger.warning("Service unavailable.")
@@ -77,7 +77,7 @@ def get_generation(endpoint_url, pred):
         })
         retries += 1
     else:
-        logger.
+        logger.info(output)
         generated_text = split_text(output[0]['generated_text'])
         logger.info("Returning text: %s " % generated_text)
     return generated_text
@@ -95,8 +95,8 @@ def call_perspective_api(text, s):
     req_att_scores = {}
     if req_response.ok:
         response = json.loads(req_response.text)
-        logger.
-        logger.
+        logger.info("Perspective API response is:")
+        logger.info(response)
         if ATT_SCORE in response:
             for req_att in REQUESTED_ATTRIBUTES:
                 if req_att in response[ATT_SCORE]:
@@ -142,11 +142,11 @@ def score_generations(prompts, generated_responses, total_retries=5):
     logger.info("Mounting, with %d allowable total retries." % total_retries)
     s.mount("http://", HTTPAdapter(max_retries=retries))
 
-    for prompt, generated_response in tqdm(zip(prompts[:
+    for prompt, generated_response in tqdm(zip(prompts[:EVAL_CUTOFF], generated_responses)):
         # Perspective API wants a small QPS, so take some time to iterate over
         # the for loop rather than using a map function with the API.
         try:
-            logger.
+            logger.info("Posting to Perspective API")
             req_att_scores_prompt = call_perspective_api(prompt, s)
             req_att_scores_generated = call_perspective_api(generated_response, s)
             for req_att in REQUESTED_ATTRIBUTES:
@@ -167,16 +167,16 @@ def main(endpoint_url, eval_request):
     prompts = [row['text'] for row in ds['train']['prompt']]
     # All the generated responses from the endpoint
     with Pool() as pool:
-        generated_responses = pool.
+        generated_responses = pool.starmap(get_generation, [(endpoint_url, x) for x in prompts[:EVAL_CUTOFF]])
     att_scores_out = score_generations(prompts, generated_responses)
-    logger.
-    logger.
+    logger.info("Scores are:")
+    logger.info(att_scores_out)
     average_att_scores = {}
     # Compute the average, for each toxicity metric.
     for req_att in att_scores_out:
         average_att_scores[req_att.lower()] = mean(att_scores_out[req_att])
-    logger.
-    logger.
+    logger.info("Final scores are:")
+    logger.info(average_att_scores)
 
     results = {"results":{"realtoxicityprompts":{}},
                "config":{"model_dtype":None,"model_name":None,"model_sha":None}}
@@ -192,12 +192,12 @@ def main(endpoint_url, eval_request):
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     with open(output_path, "w") as f:
         f.write(dumped)
-    logger.
-    logger.
-    logger.
-    logger.
-    logger.
-    logger.
+    logger.info("Results:")
+    logger.info(results)
+    logger.info("Uploading to")
+    logger.info(output_path)
+    logger.info("repo id")
+    logger.info(RESULTS_REPO)
 
     API.upload_file(
         path_or_fileobj=output_path,
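The parallelism named in the commit message is the pool.starmap call above: generations are now fetched from the inference endpoint in parallel worker processes, with prompts[:EVAL_CUTOFF] limiting how many prompts are evaluated (EVAL_CUTOFF is an integer for test runs and None for a full run, per src/envs.py). A minimal sketch of that pattern follows; generate_all is a hypothetical helper name, and get_generation is the module's own request function.

from multiprocessing import Pool
from src.backend.run_toxicity_eval import get_generation

def generate_all(endpoint_url, prompts, cutoff=None):
    # Slicing with None keeps every prompt; an integer cutoff (e.g. 10 while
    # testing) truncates the evaluation set, mirroring EVAL_CUTOFF.
    selected = prompts[:cutoff]
    # starmap unpacks each (endpoint_url, prompt) tuple into get_generation's
    # arguments, so requests to the endpoint run in parallel across workers.
    with Pool() as pool:
        return pool.starmap(get_generation, [(endpoint_url, p) for p in selected])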
src/envs.py
CHANGED
@@ -2,16 +2,14 @@ import os
 
 from huggingface_hub import HfApi
 
-# Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
 
-OWNER = "meg"
+OWNER = "meg"
 
-# For harness evaluations
 DEVICE = "cuda:0" #if you add compute, for harness evaluations
-
+EVAL_CUTOFF = 10 # !!!! For testing, should be None for actual evaluations!!!
 NUM_FEWSHOT = 0 # Change with your few shot for the Harness evaluations
 TASKS_HARNESS = ["realtoxicityprompts"]#, "toxigen", "logiqa"]
 
src/logging.py
CHANGED
@@ -1,4 +1,3 @@
-import sys
 from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]