michael-guenther committed
Commit • 290e593 • 1 Parent(s): 807ba34

support-cpu (#2)

- set use_flash_attn if not available (71ef01733c24797743bbc24b9c39661cb4a132e2)
- use getattr function (695207d827c7094bf9b0f7d7c048692b0633488e)

Files changed:
- mha.py +0 -2
- modeling_xlm_roberta.py +16 -3
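After this commit the custom XLM-RoBERTa modeling code is meant to load and run on machines without a GPU or without flash_attn installed, falling back to PyTorch's native attention. A minimal usage sketch, assuming a placeholder repository id and loading with trust_remote_code (neither is stated in the diff):

    from transformers import AutoModel

    # Placeholder repository id; substitute a repo that ships this modeling code.
    model = AutoModel.from_pretrained(
        "org/xlm-roberta-flash-model",
        trust_remote_code=True,  # so the custom modeling_xlm_roberta.py is used
    )
    model = model.to("cpu")
    # On a CPU-only machine get_use_flash_attn() returns False, so the encoder
    # is built with the PyTorch attention path instead of flash_attn kernels.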
mha.py CHANGED

@@ -10,8 +10,6 @@ import torch
 import torch.nn as nn
 from einops import rearrange, repeat
 
-from flash_attn.utils.distributed import get_dim_for_local_rank
-
 try:
     from flash_attn import (
         flash_attn_kvpacked_func,
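The removed line was an unconditional flash_attn import, which made mha.py fail to import in environments without the package; the remaining flash_attn imports already sit inside a try block. A hedged sketch of that optional-import pattern (the except body is an assumption; only flash_attn_kvpacked_func is visible in the diff):

    try:
        from flash_attn import flash_attn_kvpacked_func
    except ImportError:
        # No flash_attn (e.g. a CPU-only install): leave the symbol as None so
        # call sites can detect it and use the PyTorch attention path instead.
        flash_attn_kvpacked_func = None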
modeling_xlm_roberta.py CHANGED

@@ -1,6 +1,5 @@
 # This implementation was adopted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/models/bert.py
 # Commit id: abbc1311731867310635f9edc2a9ec18317c8c48
-
 # Copyright (c) 2022, Tri Dao.
 # This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation.
 # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
@@ -8,6 +7,7 @@
 
 # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
 
+import importlib.util
 import logging
 import re
 from collections import OrderedDict
@@ -65,8 +65,21 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
+def get_use_flash_attn(config: XLMRobertaFlashConfig):
+    if not getattr(config, "use_flash_attn", False):
+        return False
+    if not torch.cuda.is_available():
+        return False
+    if importlib.util.find_spec("flash_attn") is None:
+        logger.warning(
+            'flash_attn is not installed. Using PyTorch native attention implementation.'
+        )
+        return False
+    return True
+
+
 def create_mixer_cls(config, cross_attn=False, return_residual=False):
-    use_flash_attn =
+    use_flash_attn = get_use_flash_attn(config)
     fused_bias_fc = getattr(config, "fused_bias_fc", False)
     rotary_kwargs = {}
     if config.position_embedding_type == "rotary":
@@ -169,7 +182,7 @@ def _init_weights(module, initializer_range=0.02):
 class XLMRobertaEncoder(nn.Module):
     def __init__(self, config: XLMRobertaFlashConfig):
         super().__init__()
-        self.use_flash_attn =
+        self.use_flash_attn = get_use_flash_attn(config)
         self.layers = nn.ModuleList(
             [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
         )
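The new get_use_flash_attn helper only enables flash attention when the config asks for it, CUDA is available, and the flash_attn package can be imported; otherwise the encoder falls back to native attention, with a warning in the missing-package case. A minimal sketch of calling it directly, assuming the config class is defined in a sibling configuration_xlm_roberta.py (the file layout is an assumption):

    from configuration_xlm_roberta import XLMRobertaFlashConfig
    from modeling_xlm_roberta import get_use_flash_attn

    config = XLMRobertaFlashConfig(use_flash_attn=True)
    # Returns False on a CPU-only machine or when flash_attn is not installed
    # (logging a warning in the latter case), True otherwise.
    print(get_use_flash_attn(config))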