diff --git "a/assets/index-DCR9Hsyd.js" "b/assets/index-DCR9Hsyd.js" --- "a/assets/index-DCR9Hsyd.js" +++ "b/assets/index-DCR9Hsyd.js" @@ -2370,4 +2370,4 @@ If a question does not make any sense, or is not factually coherent, explain why `,"\\n").replaceAll("'","\\'"))}}class xv extends Tg{}class Sv extends Ee{}class kv extends Ee{}class Ev extends Ee{}class Cv extends Ee{}class Tv extends Ee{}class Av extends Ee{}class Iv extends Ee{_default_chat_template=`{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + ' ' + message['content'] | trim + ' ' }}{% endfor %}{% if add_generation_prompt %}{{'model -'}}{% endif %}`}class Mv extends Ee{}function pl(t,e,r,n){if(!("language_codes"in t)||!Array.isArray(t.language_codes))throw new Error("Tokenizer must have `language_codes` attribute set and it should be an array of language ids.");if(!("languageRegex"in t)||!(t.languageRegex instanceof RegExp))throw new Error("Tokenizer must have `languageRegex` attribute set and it should be a regular expression.");if(!("lang_to_token"in t)||typeof t.lang_to_token!="function")throw new Error("Tokenizer must have `lang_to_token` attribute set and it should be a function.");const a=n.src_lang,i=n.tgt_lang;if(!t.language_codes.includes(i))throw new Error(`Target language code "${i}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);if(a!==void 0){if(!t.language_codes.includes(a))throw new Error(`Source language code "${a}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);for(const s of t.post_processor.config.single)if("SpecialToken"in s&&t.languageRegex.test(s.SpecialToken.id)){s.SpecialToken.id=t.lang_to_token(a);break}}return n.forced_bos_token_id=t.model.convert_tokens_to_ids([t.lang_to_token(i)])[0],t._call(e,r)}class Ov extends Ee{constructor(e,r){super(e,r),this.languageRegex=/^[a-z]{3}_[A-Z][a-z]{3}$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)),this.lang_to_token=n=>n}_build_translation_inputs(e,r,n){return pl(this,e,r,n)}}class zv extends Ee{constructor(e,r){super(e,r),this.languageRegex=/^__[a-z]{2,3}__$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)).map(n=>n.slice(2,-2)),this.lang_to_token=n=>`__${n}__`}_build_translation_inputs(e,r,n){return pl(this,e,r,n)}}class Pv extends Ee{_default_chat_template='{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}';get timestamp_begin(){return this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0]+1}_decode_asr(e,{return_timestamps:r=!1,return_language:n=!1,time_precision:a=null,force_full_sequences:i=!0}={}){if(a===null)throw Error("Must specify time_precision");let s=null;const o=r==="word";function l(){return{language:s,timestamp:[null,null],text:""}}const u=[];let p=l(),h=0;const m=this.timestamp_begin;let d=[],_=[],w=!1,v=null;const S=new Set(this.all_special_ids);for(const T of e){const A=T.tokens,P=o?T.token_timestamps:null;let B=null,D=m;if("stride"in T){const[ie,te,de]=T.stride;if(h-=te,v=ie-de,te&&(D=te/a+m),de)for(let se=A.length-1;se>=0;--se){const M=Number(A[se]);if(M>=m){if(B!==null&&(M-m)*a=m){const de=(te-m)*a+h,se=Mi(de,2);if(B!==null&&te>=B)w=!0;else if(w||d.length>0&&te0?(d.push(q),o&&_.push(H)):d.every(ie=>ie.length===0)&&(p=l(),d=[],q=[],_=[],H=[])}if(d.length>0){if(i&&r)throw new Error("Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.");const[T,A]=this.findLongestCommonSequence(d,_),P=this.decode(T);p.text=P,o&&(p.words=this.collateWordTimestamps(T,A,s)),u.push(p)}let $=Object.create(null);const E=u.map(T=>T.text).join("");if(r||n){for(let T=0;T0;let o=s?[]:null,l=s?r[0]:null;for(let u=1;use===ie[M]).length,de=te/T+A;te>1&&de>h&&(h=de,m=[P,B,q,H])}const[_,w,v,S]=m,$=Math.floor((w+_)/2),E=Math.floor((S+v)/2);i.push(...n.slice(0,$)),n=p.slice(E),a=n.length,s&&(o.push(...l.slice(0,$)),l=r[u].slice(E))}return i.push(...n),s?(o.push(...l),[i,o]):[i,[]]}collateWordTimestamps(e,r,n){const[a,i,s]=this.combineTokensIntoWords(e,n),o=[];for(let l=0;l=a){const o=((s-a)*n).toFixed(2);i.push(`<|${o}|>`),i.push([])}else i[i.length-1].push(s);return i=i.map(s=>typeof s=="string"?s:super.decode(s,r)),i.join("")}splitTokensOnUnicode(e){const r=this.decode(e,{decode_with_timestamps:!0}),n="�",a=[],i=[],s=[];let o=[],l=[],u=0;for(let p=0;p=this.model.tokens_to_ids.get("<|endoftext|>"),_=p.startsWith(" "),w=p.trim(),v=l.test(w);if(d||_||v||i.length===0)i.push(p),s.push(h),o.push(m);else{const S=i.length-1;i[S]+=p,s[S].push(...h),o[S].push(...m)}}return[i,s,o]}mergePunctuations(e,r,n,a,i){const s=structuredClone(e),o=structuredClone(r),l=structuredClone(n);let u=s.length-2,p=s.length-1;for(;u>=0;)s[u].startsWith(" ")&&a.includes(s[u].trim())?(s[p]=s[u]+s[p],o[p]=ft(o[u],o[p]),l[p]=ft(l[u],l[p]),s[u]="",o[u]=[],l[u]=[]):p=u,--u;for(u=0,p=1;ph),o.filter(h=>h.length>0),l.filter(h=>h.length>0)]}get_decoder_prompt_ids({language:e=null,task:r=null,no_timestamps:n=!0}={}){const a=[];if(e){const i=_g(e),s=this.model.tokens_to_ids.get(`<|${i}|>`);if(s===void 0)throw new Error(`Unable to find language "${i}" in model vocabulary. Please report this issue at ${po}.`);a.push(s)}else a.push(null);if(r){if(r=r.toLowerCase(),r!=="transcribe"&&r!=="translate")throw new Error(`Task "${r}" is not supported. Must be one of: ["transcribe", "translate"]`);const i=this.model.tokens_to_ids.get(`<|${r}|>`);if(i===void 0)throw new Error(`Unable to find task "${r}" in model vocabulary. Please report this issue at ${po}.`);a.push(i)}else a.push(null);if(n){const i=this.model.tokens_to_ids.get("<|notimestamps|>");if(i===void 0)throw new Error(`Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at ${po}.`);a.push(i)}return a.map((i,s)=>[s+1,i]).filter(i=>i[1]!==null)}}class Rv extends Ee{}class Bv extends Ee{}class Dv extends Ee{}class Nv extends Ee{constructor(e,r){super(e,r),this.languageRegex=/^(>>\w+<<)\s*/g,this.supported_language_codes=this.model.vocab.filter(n=>this.languageRegex.test(n)),console.warn('WARNING: `MarianTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.')}_encode_text(e){if(e===null)return null;const[r,...n]=e.trim().split(this.languageRegex);if(n.length===0)return super._encode_text(r);if(n.length===2){const[a,i]=n;return this.supported_language_codes.includes(a)||console.warn(`Unsupported language code "${a}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`),ft([a],super._encode_text(i))}}}class Fv extends Ee{}class Ag extends Ee{_default_chat_template="{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"}class Lv extends Ag{}class Uv extends Ee{}class Wv extends Ee{}class Vv extends Ee{constructor(e,r){super(e,r),this.decoder=new Yb({})}}class Gv extends Ee{}class ht{static TOKENIZER_CLASS_MAPPING={T5Tokenizer:yv,DistilBertTokenizer:fv,CamembertTokenizer:mv,DebertaTokenizer:uv,DebertaV2Tokenizer:dv,BertTokenizer:iv,HerbertTokenizer:cv,ConvBertTokenizer:pv,RoFormerTokenizer:hv,XLMTokenizer:gv,ElectraTokenizer:_v,MobileBertTokenizer:ov,SqueezeBertTokenizer:lv,AlbertTokenizer:sv,GPT2Tokenizer:Eg,BartTokenizer:wv,MBartTokenizer:Cg,MBart50Tokenizer:bv,RobertaTokenizer:vv,WhisperTokenizer:Pv,CodeGenTokenizer:Rv,CLIPTokenizer:Bv,SiglipTokenizer:Dv,MarianTokenizer:Nv,BloomTokenizer:$v,NllbTokenizer:Ov,M2M100Tokenizer:zv,LlamaTokenizer:Tg,CodeLlamaTokenizer:xv,XLMRobertaTokenizer:Sv,MPNetTokenizer:kv,FalconTokenizer:Ev,GPTNeoXTokenizer:Cv,EsmTokenizer:Tv,Wav2Vec2CTCTokenizer:Fv,BlenderbotTokenizer:Ag,BlenderbotSmallTokenizer:Lv,SpeechT5Tokenizer:Uv,NougatTokenizer:Wv,VitsTokenizer:Vv,Qwen2Tokenizer:Av,GemmaTokenizer:Iv,Grok1Tokenizer:Mv,CohereTokenizer:Gv,PreTrainedTokenizer:Ee};static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",legacy:o=null}={}){const[l,u]=await yg(e,{progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,legacy:o}),p=u.tokenizer_class?.replace(/Fast$/,"")??"PreTrainedTokenizer";let h=this.TOKENIZER_CLASS_MAPPING[p];return h||(console.warn(`Unknown tokenizer class "${p}", attempting to construct from base class.`),h=Ee),new h(l,u)}}async function Hv(t,e){return await Br(t,"config.json",!0,e)}function fa(t){const e={};let r={};switch(t.model_type){case"llava":case"paligemma":r=fa(t.text_config);break;case"moondream1":r=fa(t.phi_config);break;case"musicgen":r=fa(t.decoder);break;case"gpt2":case"gptj":case"codegen":case"gpt_bigcode":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="n_embd";break;case"gpt_neox":case"stablelm":case"opt":case"phi":case"phi3":case"falcon":e.num_heads="num_attention_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size";break;case"llama":case"cohere":case"mistral":case"starcoder2":case"qwen2":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size",e.num_attention_heads="num_attention_heads";break;case"gemma":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.dim_kv="head_dim";break;case"openelm":e.num_heads="num_kv_heads",e.num_layers="num_transformer_layers",e.dim_kv="head_dim";break;case"gpt_neo":case"donut-swin":e.num_heads="num_heads",e.num_layers="num_layers",e.hidden_size="hidden_size";break;case"bloom":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="hidden_size";break;case"mpt":e.num_heads="n_heads",e.num_layers="n_layers",e.hidden_size="d_model";break;case"t5":case"mt5":case"longt5":e.num_decoder_layers="num_decoder_layers",e.num_decoder_heads="num_heads",e.decoder_dim_kv="d_kv",e.num_encoder_layers="num_layers",e.num_encoder_heads="num_heads",e.encoder_dim_kv="d_kv";break;case"bart":case"mbart":case"marian":case"whisper":case"m2m_100":case"blenderbot":case"blenderbot-small":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="d_model",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="d_model";break;case"speecht5":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="hidden_size",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="hidden_size";break;case"trocr":e.num_encoder_layers=e.num_decoder_layers="decoder_layers",e.num_encoder_heads=e.num_decoder_heads="decoder_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="d_model";break;case"musicgen_decoder":e.num_encoder_layers=e.num_decoder_layers="num_hidden_layers",e.num_encoder_heads=e.num_decoder_heads="num_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="hidden_size";break;case"vision-encoder-decoder":const a=fa(t.decoder),i="num_decoder_layers"in a,s=Dr(t,["model_type","is_encoder_decoder"]);return i?(s.num_decoder_layers=a.num_decoder_layers,s.num_decoder_heads=a.num_decoder_heads,s.decoder_hidden_size=a.decoder_hidden_size,s.num_encoder_layers=a.num_encoder_layers,s.num_encoder_heads=a.num_encoder_heads,s.encoder_hidden_size=a.encoder_hidden_size):(s.num_layers=a.num_layers,s.num_heads=a.num_heads,s.hidden_size=a.hidden_size),s}const n={...r,...Dr(t,["model_type","multi_query","is_encoder_decoder"])};for(const a in e)n[a]=t[e[a]];return n}function Ig(t,{prefix:e="past_key_values",encoder_add_pkv:r=!0}={}){const n={},a=t.normalized_config,i=1;if(a.is_encoder_decoder&&r){const s=a.encoder_dim_kv??a.encoder_hidden_size/a.num_encoder_heads,o=a.decoder_dim_kv??a.decoder_hidden_size/a.num_decoder_heads,l=[i,a.num_encoder_heads,0,s],u=[i,a.num_decoder_heads,0,o];for(let p=0;p=1&&s[s.length-1]>=this.timestamp_begin,l=s.length<2||s[s.length-2]>=this.timestamp_begin;if(o&&(l?i.subarray(this.timestamp_begin).fill(-1/0):i.subarray(0,this.eos_token_id).fill(-1/0)),e[n].length===this.begin_index&&this.max_initial_timestamp_index!==null){const m=this.timestamp_begin+this.max_initial_timestamp_index;i.subarray(m+1).fill(-1/0)}const u=U0(i),p=Math.log(u.subarray(this.timestamp_begin).map(Math.exp).reduce((m,d)=>m+d)),h=Kt(u.subarray(0,this.timestamp_begin))[0];p>h&&i.subarray(0,this.timestamp_begin).fill(-1/0)}return r}}class Zv extends yr{constructor(e){super(),this.no_repeat_ngram_size=e}getNgrams(e){const r=e.length,n=[];for(let i=0;i1 to use the classifier free guidance processor, got guidance scale ${e}.`);this.guidance_scale=e}_call(e,r){if(r.dims[0]!==2*e.length)throw new Error(`Logits should have twice the batch size of the input ids, the first half of batches corresponding to the conditional inputs, and the second half of batches corresponding to the unconditional inputs. Got batch size ${r.dims[0]} for the logits and ${e.length} for the input ids.`);const n=e.length,a=r.slice([0,n],null),i=r.slice([n,r.dims[0]],null);for(let s=0;s1)throw new Error(`\`top_p\` must be a float > 0 and < 1, but is ${e}`);if(!Number.isInteger(n)||n<1)throw new Error(`\`min_tokens_to_keep\` must be a positive integer, but is ${n}`);this.top_p=e,this.filter_value=r,this.min_tokens_to_keep=n}}class s2 extends hl{constructor(e,{filter_value:r=-1/0,min_tokens_to_keep:n=1}={}){if(super(),!Number.isInteger(e)||e<0)throw new Error(`\`top_k\` must be a positive integer, but is ${e}`);this.top_k=Math.max(e,n),this.filter_value=r}}class Og{max_length=20;max_new_tokens=null;min_length=0;min_new_tokens=null;early_stopping=!1;max_time=null;do_sample=!1;num_beams=1;num_beam_groups=1;penalty_alpha=null;use_cache=!0;temperature=1;top_k=50;top_p=1;typical_p=1;epsilon_cutoff=0;eta_cutoff=0;diversity_penalty=0;repetition_penalty=1;encoder_repetition_penalty=1;length_penalty=1;no_repeat_ngram_size=0;bad_words_ids=null;force_words_ids=null;renormalize_logits=!1;constraints=null;forced_bos_token_id=null;forced_eos_token_id=null;remove_invalid_values=!1;exponential_decay_length_penalty=null;suppress_tokens=null;begin_suppress_tokens=null;forced_decoder_ids=null;guidance_scale=null;num_return_sequences=1;output_attentions=!1;output_hidden_states=!1;output_scores=!1;return_dict_in_generate=!1;pad_token_id=null;bos_token_id=null;eos_token_id=null;encoder_no_repeat_ngram_size=0;decoder_start_token_id=null;generation_kwargs={};constructor(e){Object.assign(this,Dr(e,Object.getOwnPropertyNames(this)))}}class fl extends bt{_call(e,r){throw Error("StoppingCriteria needs to be subclassed")}}class ml extends bt{constructor(){super(),this.criteria=[]}push(e){this.criteria.push(e)}extend(e){e instanceof ml?e=e.criteria:e instanceof fl&&(e=[e]),this.criteria.push(...e)}_call(e,r){const n=new Array(e.length).fill(!1);for(const a of this.criteria){const i=a(e,r);for(let s=0;sr.length>=this.max_length)}}class l2 extends fl{constructor(e){super(),Array.isArray(e)||(e=[e]),this.eos_token_id=e}_call(e,r){return e.map(n=>{const a=n.at(-1);return this.eos_token_id.some(i=>a==i)})}}class ss extends bt{constructor(e){super(),this.generation_config=e}async _call(e){return this.sample(e)}async sample(e){throw Error("sample should be implemented in subclasses.")}getLogits(e,r){let n=e.dims.at(-1),a=e.data;if(r===-1)a=a.slice(-n);else{let i=r*n;a=a.slice(i,i+n)}return a}randomSelect(e){let r=0;for(let a=0;a1)return new c2(e);if(e.num_return_sequences>1)throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${e.num_return_sequences}.`);return new u2(e)}}class u2 extends ss{async sample(e){const r=Kt(e.data)[1];return[[BigInt(r),0]]}}class d2 extends ss{async sample(e){let r=e.dims.at(-1);this.generation_config.top_k>0&&(r=Math.min(this.generation_config.top_k,r));const[n,a]=await Dn(e,r),i=wt(n.data);return Array.from({length:this.generation_config.num_beams},()=>{const s=this.randomSelect(i);return[a.data[s],Math.log(i[s])]})}}class c2 extends ss{async sample(e){let r=e.dims.at(-1);this.generation_config.top_k>0&&(r=Math.min(this.generation_config.top_k,r));const[n,a]=await Dn(e,r),i=wt(n.data);return Array.from({length:this.generation_config.num_beams},(s,o)=>[a.data[o],Math.log(i[o])])}}class p2 extends Og{return_timestamps=null;return_token_timestamps=null;num_frames=null;alignment_heads=null;task=null;language=null;no_timestamps_token_id=null;prompt_ids=null;is_multilingual=null;lang_to_id=null;task_to_id=null;max_initial_timestamp_index=1}const ve={EncoderOnly:0,EncoderDecoder:1,Seq2Seq:2,Vision2Seq:3,DecoderOnly:4,MaskGeneration:5,ImageTextToText:6,Musicgen:7},es=new Map,zg=new Map,ba=new Map;async function h2(t,e,r){let n=r.device;n&&typeof n!="string"&&(n.hasOwnProperty(e)?n=n[e]:(console.warn(`Device not specified for ${e}. Using the default device.`),n=null));const a=Sw(n);let i=r.dtype;if(typeof i!="string"&&(i&&i.hasOwnProperty(e)?i=i[e]:(i=qv[a[0]],console.warn(`Dtype not specified for ${e}. Using the default dtype: ${i}.`))),xp.hasOwnProperty(i)){if(i===Rt.fp16&&!await jv())throw new Error("The device does not support fp16.")}else throw new Error(`Invalid dtype: ${i}. Should be one of: ${Object.keys(Rt).join(", ")}`);const s=xp[i],o=`${r.subfolder??""}/${e}${s}.onnx`,l={...r.session_options};l.executionProviders??=a;const u=Ii(t,o,!0,r);let p=[];if(r.use_external_data_format){if(an.IS_NODE_ENV)throw new Error("External data format is not yet supported in Node.js");const m=`${e}${s}.onnx_data`,d=`${r.subfolder??""}/${m}`;p.push(new Promise(async(_,w)=>{const v=await Ii(t,d,!0,r);_({path:m,data:v})}))}else l.externalData!==void 0&&(p=l.externalData.map(async m=>{if(typeof m.data=="string"){const d=await Ii(t,m.data,!0,r);return{...m,data:d}}return m}));if(p.length>0&&(l.externalData=await Promise.all(p)),n==="webgpu"){const m=Ig(r.config,{prefix:"present"});if(Object.keys(m).length>0){const d={};for(const _ in m)d[_]="gpu-buffer";l.preferredOutputLocation=d}}return{buffer:await u,session_options:l}}async function Xr(t,e,r){const n=Object.keys(e),a=await Promise.all(n.map(async s=>h2(t,e[s],r))),i={};for(let s=0;s0)throw new Error(`An error occurred during model execution: "Missing the following inputs: ${n.join(", ")}.`);const a=Object.keys(e).length,i=t.inputNames.length;if(a>i){let s=Object.keys(e).filter(o=>!t.inputNames.includes(o));console.warn(`WARNING: Too many inputs were provided (${a} > ${i}). The following inputs will be ignored: "${s.join(", ")}".`)}return r}async function Fr(t,e){const r=f2(t,e);try{const n=Object.fromEntries(Object.entries(r).map(([i,s])=>[i,s.ort_tensor]));let a=await t.run(n);return a=Pg(a),a}catch(n){throw console.error(`An error occurred during model execution: "${n}".`),console.error("Inputs given to model:",r),n}}function Pg(t){for(let e in t)fg(t[e])?t[e]=new ue(t[e]):typeof t[e]=="object"&&Pg(t[e]);return t}function Rg(t){if(t instanceof ue)return t;if(t.length===0)throw Error("items must be non-empty");if(Array.isArray(t[0])){if(t.some(e=>e.length!==t[0].length))throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");return new ue("int64",BigInt64Array.from(t.flat().map(e=>BigInt(e))),[t.length,t[0].length])}else return new ue("int64",BigInt64Array.from(t.map(e=>BigInt(e))),[1,t.length])}function Bg(t){return new ue("bool",[t],[1])}async function Sp(t,e){let{encoder_outputs:r,past_key_values:n}=e;if(!r){const l=Dr(e,t.sessions.model.inputNames);r=(await Ca(t,l)).last_hidden_state}const{input_ids:a,decoder_input_ids:i,...s}=e;return s.input_ids=i,s.encoder_hidden_states=r,t.sessions.decoder_model_merged.inputNames.includes("encoder_attention_mask")&&(s.encoder_attention_mask=e.attention_mask),await gl(t,s,!0)}async function Ca(t,e){const r=t.sessions.model,n=Object.create(null);for(const a of r.inputNames)n[a]=e[a];return r.inputNames.includes("token_type_ids")&&!n.token_type_ids&&(n.token_type_ids=new ue("int64",new BigInt64Array(n.input_ids.data.length),n.input_ids.dims)),await Fr(r,n)}async function gl(t,e,r=!1){const n=t.sessions[r?"decoder_model_merged":"model"],{past_key_values:a,...i}=e;n.inputNames.includes("use_cache_branch")&&(i.use_cache_branch=Bg(!!a)),n.inputNames.includes("position_ids")&&i.attention_mask&&!i.position_ids&&(i.position_ids=g2(i,a)),t.addPastKeyValues(i,a);const s=Dr(i,n.inputNames);return await Fr(n,s)}async function m2(t,{input_ids:e=null,attention_mask:r=null,pixel_values:n=null,position_ids:a=null,inputs_embeds:i=null,past_key_values:s=null,generation_config:o=null,logits_processor:l=null,...u}){if(!i){if(i=await t.encode_text({input_ids:e}),n&&e.dims[1]!==1){const h=await t.encode_image({pixel_values:n});({inputs_embeds:i,attention_mask:r}=t._merge_input_ids_with_image_features({image_features:h,inputs_embeds:i,input_ids:e,attention_mask:r}))}else if(s&&n&&e.dims[1]===1){const h=e.dims[1],m=Object.values(s)[0].dims.at(-2);r=gr([Ma([e.dims[0],m]),r.slice(null,[r.dims[1]-h,r.dims[1]])],1)}}return await gl(t,{inputs_embeds:i,past_key_values:s,attention_mask:r,position_ids:a,generation_config:o,logits_processor:l},!0)}function g2(t,e=null){const{input_ids:r,inputs_embeds:n,attention_mask:a}=t,[i,s]=a.dims,o=new BigInt64Array(a.data.length);for(let u=0;ui.dims[1])){if(ao==t.config.image_token_index)){const o=t.config.num_image_tokens;if(!o)throw new Error("`num_image_tokens` is missing in the model configuration.");const l=i.dims[1]-(a-o);r.input_ids=i.slice(null,[-l,null]),r.attention_mask=Ma([1,a+l])}}}return r}function _2(t,e,r,n){const{...a}=r;return r.past_key_values&&(e=e.map(s=>[s.at(-1)])),a.decoder_input_ids=Rg(e),a}class Q extends bt{main_input_name="input_ids";forward_params=["input_ids","attention_mask"];constructor(e,r){super(),this.config=e,this.sessions=r;const n=ba.get(this.constructor),a=es.get(n);this.can_generate=!1,this._forward=null,this._prepare_inputs_for_generation=null,a===ve.DecoderOnly?(this.can_generate=!0,this._forward=gl,this._prepare_inputs_for_generation=kp):a===ve.Seq2Seq||a===ve.Vision2Seq||a===ve.Musicgen?(this.can_generate=!0,this._forward=Sp,this._prepare_inputs_for_generation=_2):a===ve.EncoderDecoder?this._forward=Sp:a===ve.ImageTextToText?(this.can_generate=!0,this._forward=m2,this._prepare_inputs_for_generation=kp):this._forward=Ca,this.can_generate&&this.forward_params.push("past_key_values"),this.custom_config=this.config["transformers.js_config"]??{}}async dispose(){const e=[];for(const r of Object.values(this.sessions))r?.handler?.dispose&&e.push(r.handler.dispose());return await Promise.all(e)}static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",model_file_name:o=null,subfolder:l="onnx",device:u=null,dtype:p=null,use_external_data_format:h=null,session_options:m={}}={}){let d={progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,model_file_name:o,subfolder:l,device:u,dtype:p,use_external_data_format:h,session_options:m};const _=ba.get(this),w=es.get(_);d.config=await Mg.from_pretrained(e,d);let v;return w===ve.DecoderOnly?v=await Promise.all([Xr(e,{model:d.model_file_name??"model"},d),Br(e,"generation_config.json",!1,d)]):w===ve.Seq2Seq||w===ve.Vision2Seq?v=await Promise.all([Xr(e,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},d),Br(e,"generation_config.json",!1,d)]):w===ve.MaskGeneration?v=await Promise.all([Xr(e,{model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"},d)]):w===ve.EncoderDecoder?v=await Promise.all([Xr(e,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},d)]):w===ve.ImageTextToText?v=await Promise.all([Xr(e,{embed_tokens:"embed_tokens",vision_encoder:"vision_encoder",decoder_model_merged:"decoder_model_merged"},d),Br(e,"generation_config.json",!1,d)]):w===ve.Musicgen?v=await Promise.all([Xr(e,{model:"text_encoder",decoder_model_merged:"decoder_model_merged",encodec_decode:"encodec_decode"},d),Br(e,"generation_config.json",!1,d)]):(w!==ve.EncoderOnly&&console.warn(`Model type for '${_??n?.model_type}' not found, assuming encoder-only architecture. Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`),v=await Promise.all([Xr(e,{model:d.model_file_name??"model"},d)])),new this(d.config,...v)}async _call(e){return await this.forward(e)}async forward(e){return await this._forward(this,e)}_get_logits_warper(e){const r=new Ro;return e.temperature!==null&&e.temperature!==1&&r.push(new a2(e.temperature)),e.top_k!==null&&e.top_k!==0&&r.push(new s2(e.top_k)),e.top_p!==null&&e.top_p<1&&r.push(new i2(e.top_p)),r}_get_logits_processor(e,r,n=null){const a=new Ro;if(e.repetition_penalty!==null&&e.repetition_penalty!==1&&a.push(new Jv(e.repetition_penalty)),e.no_repeat_ngram_size!==null&&e.no_repeat_ngram_size>0&&a.push(new Zv(e.no_repeat_ngram_size)),e.bad_words_ids!==null&&a.push(new r2(e.bad_words_ids,e.eos_token_id)),e.min_length!==null&&e.eos_token_id!==null&&e.min_length>0&&a.push(new e2(e.min_length,e.eos_token_id)),e.min_new_tokens!==null&&e.eos_token_id!==null&&e.min_new_tokens>0&&a.push(new t2(r,e.min_new_tokens,e.eos_token_id)),e.forced_bos_token_id!==null&&a.push(new Kv(e.forced_bos_token_id)),e.forced_eos_token_id!==null&&a.push(new Yv(e.max_length,e.forced_eos_token_id)),e.begin_suppress_tokens!==null){const i=r>1||e.forced_bos_token_id===null?r:r+1;a.push(new Xv(e.begin_suppress_tokens,i))}return e.guidance_scale!==null&&e.guidance_scale>1&&a.push(new n2(e.guidance_scale)),n!==null&&a.extend(n),a}_prepare_generation_config(e,r,n=Og){const a={...this.config};for(const s of["decoder","generator","text_config"])s in a&&Object.assign(a,a[s]);const i=new n(a);return"generation_config"in this&&Object.assign(i,this.generation_config),e&&Object.assign(i,e),r&&Object.assign(i,Dr(r,Object.getOwnPropertyNames(i))),i}_get_stopping_criteria(e,r=null){const n=new ml;return e.max_length!==null&&n.push(new o2(e.max_length,this.config.max_position_embeddings??null)),e.eos_token_id!==null&&n.push(new l2(e.eos_token_id)),r&&n.extend(r),n}_validate_model_class(){if(!this.can_generate){const e=[Sl,kl,xl,$l],r=ba.get(this.constructor),n=new Set,a=this.config.model_type;for(const s of e){const o=s.get(a);o&&n.add(o[0])}let i=`The current model class (${r}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw n.size>0&&(i+=` Please use the following class instead: ${[...n].join(", ")}`),Error(i)}}prepare_inputs_for_generation(...e){return this._prepare_inputs_for_generation(this,...e)}_update_model_kwargs_for_generation({generated_input_ids:e,outputs:r,model_inputs:n,is_encoder_decoder:a}){return n.past_key_values=this.getPastKeyValues(r,n.past_key_values),n.input_ids=new ue("int64",e.flat(),[e.length,1]),a||(n.attention_mask=gr([n.attention_mask,Ma([n.attention_mask.dims[0],1])],1)),n.position_ids=null,n}_prepare_model_inputs({inputs:e,bos_token_id:r,model_kwargs:n}){const a=Dr(n,this.forward_params),i=this.main_input_name;if(i in a){if(e)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else a[i]=e;return{inputs_tensor:a[i],model_inputs:a,model_input_name:i}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:e,model_inputs:r,model_input_name:n,generation_config:a}){const i=Dr(r,this.sessions.model.inputNames);let{last_hidden_state:s}=await Ca(this,i);return a.guidance_scale!==null&&a.guidance_scale>1&&(s=gr([s,Pw(s,0)],0),"attention_mask"in r&&(r.attention_mask=gr([r.attention_mask,Dw(r.attention_mask)],0))),r.encoder_outputs=s,r}_prepare_decoder_input_ids_for_generation({batch_size:e,model_input_name:r,model_kwargs:n,decoder_start_token_id:a,bos_token_id:i,generation_config:s}){let{decoder_input_ids:o,...l}=n;if(!o)if(a??=i,this.config.model_type==="musicgen")o=Array.from({length:e*this.config.decoder.num_codebooks},()=>[a]);else if(Array.isArray(a)){if(a.length!==e)throw new Error(`\`decoder_start_token_id\` expcted to have length ${e} but got ${a.length}`);o=a}else o=Array.from({length:e},()=>[a]);return o=Rg(o),n.decoder_attention_mask=Rw(o),{input_ids:o,model_inputs:l}}async generate({inputs:e=null,generation_config:r=null,logits_processor:n=null,stopping_criteria:a=null,streamer:i=null,...s}){this._validate_model_class(),r=this._prepare_generation_config(r,s);let{inputs_tensor:o,model_inputs:l,model_input_name:u}=this._prepare_model_inputs({inputs:e,model_kwargs:s});const p=this.config.is_encoder_decoder;p&&("encoder_outputs"in l||(l=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:o,model_inputs:l,model_input_name:u,generation_config:r})));let h;p?{input_ids:h,model_inputs:l}=this._prepare_decoder_input_ids_for_generation({batch_size:l[u].dims.at(0),model_input_name:u,model_kwargs:l,decoder_start_token_id:r.decoder_start_token_id,bos_token_id:r.bos_token_id,generation_config:r}):h=l[u];let m=h.dims.at(-1);r.max_new_tokens!==null&&(r.max_length=m+r.max_new_tokens);const d=this._get_logits_processor(r,m,n),_=this._get_stopping_criteria(r,a),w=l[u].dims.at(0),v=ss.getSampler(r),S=new Array(w).fill(0),$=h.tolist();i&&i.put($);let E=null;for(;;){l=this.prepare_inputs_for_generation($,l,r);const A=await this.forward(l),P=A.logits.slice(null,-1,null),B=d($,P),D=[];for(let H=0;HH)){r.return_dict_in_generate&&(E=this.getPastKeyValues(A,l.past_key_values,!1));break}l=this._update_model_kwargs_for_generation({generated_input_ids:D,outputs:A,model_inputs:l,is_encoder_decoder:p})}i&&i.end();const T=new ue("int64",$.flat(),[$.length,$[0].length]);return r.return_dict_in_generate?{sequences:T,past_key_values:E}:T}addAttentionsToBeam(e,r){if(this.config.is_encoder_decoder){if(!r.cross_attentions||r.cross_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce cross-attentions. This is most likely because the model was not exported with `output_attentions=True`.");e.cross_attentions||(e.cross_attentions=[]),e.cross_attentions.push(r.cross_attentions)}if(!r.decoder_attentions||r.decoder_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce decoder-attentions. This is most likely because the model was not exported with `output_attentions=True`.");e.decoder_attentions||(e.decoder_attentions=[]),e.decoder_attentions.push(r.decoder_attentions)}groupBeams(e){const r=Object.create(null);for(const n of e)r[n.id]===void 0?r[n.id]=[n]:r[n.id].push(n);return Object.values(r)}getPastKeyValues(e,r,n=!0){const a=Object.create(null);for(const i in e)if(i.startsWith("present")){let s=i.replace("present","past_key_values");if(r&&i.includes("encoder"))a[s]=r[s];else{if(n&&r){const o=r[s];o.location==="gpu-buffer"&&o.dispose()}a[s]=e[i]}}return a}getAttentions(e){const r=Object.create(null);for(const n of["cross_attentions","decoder_attentions"]){const a=[];for(const i in e)if(i.startsWith(n)){const s=i.split(".").pop();a[s]=e[i]}r[n]=a}return r}addPastKeyValues(e,r){if(r)Object.assign(e,r);else{const n=this.custom_config.kv_cache_dtype??"float32",a=n==="float16"?new Uint16Array:[],i=Ig(this.config);for(const s in i)e[s]=new ue(n,a,i[s])}}}class Yt{}class Pa extends Q{}class y2 extends Pa{}class w2 extends Pa{async _call(e){return new $t(await super._call(e))}}class b2 extends Pa{async _call(e){return new Ae(await super._call(e))}}class v2 extends Pa{async _call(e){return new vt(await super._call(e))}}class $2 extends Pa{async _call(e){return new Ct(await super._call(e))}}class x2 extends Q{}class S2 extends x2{}class Ra extends Q{}class k2 extends Ra{}class E2 extends Ra{async _call(e){return new $t(await super._call(e))}}class C2 extends Ra{async _call(e){return new Ae(await super._call(e))}}class T2 extends Ra{async _call(e){return new vt(await super._call(e))}}class A2 extends Ra{async _call(e){return new Ct(await super._call(e))}}class Ba extends Q{}class I2 extends Ba{}class M2 extends Ba{async _call(e){return new $t(await super._call(e))}}class O2 extends Ba{async _call(e){return new Ae(await super._call(e))}}class z2 extends Ba{async _call(e){return new vt(await super._call(e))}}class P2 extends Ba{async _call(e){return new Ct(await super._call(e))}}class Da extends Q{}class R2 extends Da{}class B2 extends Da{async _call(e){return new $t(await super._call(e))}}class D2 extends Da{async _call(e){return new Ae(await super._call(e))}}class N2 extends Da{async _call(e){return new vt(await super._call(e))}}class F2 extends Da{async _call(e){return new Ct(await super._call(e))}}class Na extends Q{}class L2 extends Na{}class U2 extends Na{async _call(e){return new $t(await super._call(e))}}class W2 extends Na{async _call(e){return new Ae(await super._call(e))}}class V2 extends Na{async _call(e){return new vt(await super._call(e))}}class G2 extends Na{async _call(e){return new Ct(await super._call(e))}}class Fa extends Q{}class H2 extends Fa{}class j2 extends Fa{async _call(e){return new $t(await super._call(e))}}class q2 extends Fa{async _call(e){return new Ae(await super._call(e))}}class K2 extends Fa{async _call(e){return new vt(await super._call(e))}}class Y2 extends Fa{async _call(e){return new Ct(await super._call(e))}}class La extends Q{}class X2 extends La{}class Q2 extends La{async _call(e){return new $t(await super._call(e))}}class Z2 extends La{async _call(e){return new Ae(await super._call(e))}}class J2 extends La{async _call(e){return new vt(await super._call(e))}}class e1 extends La{async _call(e){return new Ct(await super._call(e))}}class Ua extends Q{}class t1 extends Ua{}class r1 extends Ua{async _call(e){return new Ae(await super._call(e))}}class n1 extends Ua{async _call(e){return new vt(await super._call(e))}}class a1 extends Ua{async _call(e){return new Ct(await super._call(e))}}class i1 extends Ua{async _call(e){return new $t(await super._call(e))}}class os extends Q{}class s1 extends os{}class o1 extends os{async _call(e){return new $t(await super._call(e))}}class l1 extends os{async _call(e){return new Ae(await super._call(e))}}class u1 extends os{async _call(e){return new vt(await super._call(e))}}class ls extends Q{}class d1 extends ls{}class c1 extends ls{async _call(e){return new $t(await super._call(e))}}class p1 extends ls{async _call(e){return new Ae(await super._call(e))}}class h1 extends ls{async _call(e){return new Ct(await super._call(e))}}class Wa extends Q{}class f1 extends Wa{}class m1 extends Wa{async _call(e){return new $t(await super._call(e))}}class g1 extends Wa{async _call(e){return new Ae(await super._call(e))}}class _1 extends Wa{async _call(e){return new vt(await super._call(e))}}class y1 extends Wa{async _call(e){return new Ct(await super._call(e))}}class us extends Q{}class w1 extends us{}class b1 extends us{async _call(e){return new $t(await super._call(e))}}class v1 extends us{async _call(e){return new Ae(await super._call(e))}}class $1 extends us{async _call(e){return new Ct(await super._call(e))}}class ds extends Q{}class x1 extends ds{}class S1 extends ds{async _call(e){return new Ae(await super._call(e))}}class k1 extends ds{async _call(e){return new Ct(await super._call(e))}}class E1 extends ds{async _call(e){return new $t(await super._call(e))}}class Dg extends Q{forward_params=["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class C1 extends Dg{}class T1 extends Dg{}class Ng extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class A1 extends Ng{}class I1 extends Ng{}class Fg extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class M1 extends Fg{}class O1 extends Fg{}class _l extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class z1 extends _l{}class P1 extends _l{}class R1 extends _l{async _call(e){return new Ae(await super._call(e))}}class cs extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class B1 extends cs{}class D1 extends cs{}class N1 extends cs{async _call(e){return new Ae(await super._call(e))}}class F1 extends cs{}class Lg extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class L1 extends Lg{}class U1 extends Lg{}class Ug extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class W1 extends Ug{}class V1 extends Ug{}class Va extends Q{}class G1 extends Va{}class H1 extends Va{async _call(e){return new $t(await super._call(e))}}class j1 extends Va{async _call(e){return new Ae(await super._call(e))}}class q1 extends Va{async _call(e){return new vt(await super._call(e))}}class K1 extends Va{async _call(e){return new Ct(await super._call(e))}}class Ga extends Q{}class Y1 extends Ga{}class X1 extends Ga{async _call(e){return new $t(await super._call(e))}}class Q1 extends Ga{async _call(e){return new Ae(await super._call(e))}}class Z1 extends Ga{async _call(e){return new vt(await super._call(e))}}class J1 extends Ga{async _call(e){return new Ct(await super._call(e))}}class Ha extends Q{}class e$ extends Ha{}class t$ extends Ha{async _call(e){return new $t(await super._call(e))}}class r$ extends Ha{async _call(e){return new Ae(await super._call(e))}}class n$ extends Ha{async _call(e){return new vt(await super._call(e))}}class a$ extends Ha{async _call(e){return new Ct(await super._call(e))}}class Wg extends Q{}class i$ extends Wg{}class s$ extends Wg{}class Vg extends Q{requires_attention_mask=!1;main_input_name="input_features";forward_params=["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class o$ extends Vg{}class l$ extends Vg{_prepare_generation_config(e,r){return super._prepare_generation_config(e,r,p2)}_retrieve_init_tokens(e){const r=[e.decoder_start_token_id];let n=e.language;const a=e.task;if(e.is_multilingual){n||(console.warn("No language specified - defaulting to English (en)."),n="en");const s=`<|${_g(n)}|>`;r.push(e.lang_to_id[s]),r.push(e.task_to_id[a??"transcribe"])}else if(n||a)throw new Error("Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");return!e.return_timestamps&&e.no_timestamps_token_id&&r.at(-1)!==e.no_timestamps_token_id?r.push(e.no_timestamps_token_id):e.return_timestamps&&r.at(-1)===e.no_timestamps_token_id&&(console.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),r.pop()),r.filter(i=>i!=null)}async generate({inputs:e=null,generation_config:r=null,logits_processor:n=null,stopping_criteria:a=null,...i}){r=this._prepare_generation_config(r,i);const s=this._retrieve_init_tokens(r);return r.return_timestamps&&(n??=new Ro,n.push(new Qv(r,s))),await super.generate({inputs:e,generation_config:r,logits_processor:n,decoder_input_ids:s,...i})}_extract_token_timestamps(e,r,n=null,a=.02){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");let i=this.config.median_filter_width;i===void 0&&(console.warn("Model config has no `median_filter_width`, using default value of 7."),i=7);const s=e.cross_attentions.map(u=>{let p=Array.from({length:this.config.decoder_layers},(v,S)=>gr(u.map($=>$[S]),2)),h=ka(r.map(([v,S])=>n?p[v].slice(null,S,null,[0,n]):p[v].slice(null,S)));h=h.transpose(1,0,2,3);let[m,d]=Iw(h,-2,0,!0),_=h.clone();for(let v=0;v<_.dims[0];++v){let S=_[v];for(let $=0;$h[S+1]-h[S]),_=ft([1],d).map(v=>!!v),w=[];for(let v=0;v<_.length;++v)_[v]&&w.push(m[v]*a);l[u].data.set(w,1)}return l}}class Gg extends Q{main_input_name="pixel_values";forward_params=["pixel_values","input_ids","encoder_hidden_states","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class u$ extends Q{forward_params=["input_ids","pixel_values","attention_mask","position_ids","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class Hg extends u${async encode_image({pixel_values:e}){const r=(await Fr(this.sessions.vision_encoder,{pixel_values:e})).image_features;return this.config.num_image_tokens||(console.warn(`The number of image tokens was not set in the model configuration. Setting it to the number of features detected by the vision encoder (${r.dims[1]}).`),this.config.num_image_tokens=r.dims[1]),r}async encode_text({input_ids:e}){return(await Fr(this.sessions.embed_tokens,{input_ids:e})).inputs_embeds}_merge_input_ids_with_image_features({inputs_embeds:e,image_features:r,input_ids:n,attention_mask:a}){const i=this.config.image_token_index,o=n.tolist().map(m=>m.findIndex(d=>d==i)),l=o.every(m=>m===-1),u=o.every(m=>m!==-1);if(!l&&!u)throw new Error("Every input should contain either 0 or 1 image token.");if(l)return{inputs_embeds:e,attention_mask:a};const p=[],h=[];for(let m=0;mi*s,1);e.input_labels=new ue("int64",new BigInt64Array(a).fill(1n),n)}const r={image_embeddings:e.image_embeddings,image_positional_embeddings:e.image_positional_embeddings};return e.input_points&&(r.input_points=e.input_points),e.input_labels&&(r.input_labels=e.input_labels),e.input_boxes&&(r.input_boxes=e.input_boxes),await Fr(this.sessions.prompt_encoder_mask_decoder,r)}async _call(e){return new Yx(await super._call(e))}}class Yx extends Yt{constructor({iou_scores:e,pred_masks:r}){super(),this.iou_scores=e,this.pred_masks=r}}class A_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class Xx extends A_{}class Qx extends A_{}class I_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class Zx extends I_{}class Jx extends I_{}class cn extends Q{}class eS extends cn{}class tS extends cn{async _call(e){return new Fn(await super._call(e))}}class rS extends cn{async _call(e){return new Ae(await super._call(e))}}class nS extends cn{async _call(e){return new vt(await super._call(e))}}class wl extends Q{}class aS extends wl{}class iS extends wl{async _call(e){return new Fn(await super._call(e))}}class sS extends wl{async _call(e){return new Ae(await super._call(e))}}class hs extends Q{}class oS extends hs{}class lS extends hs{async _call(e){return new Fn(await super._call(e))}}class uS extends hs{async _call(e){return new Ae(await super._call(e))}}class dS extends hs{async _call(e){return new vt(await super._call(e))}}class bl extends Q{}class cS extends bl{}class pS extends bl{async _call(e){return new Fn(await super._call(e))}}class hS extends bl{async _call(e){return new Ae(await super._call(e))}}class fS extends cn{}class mS extends cn{async _call(e){return new Fn(await super._call(e))}}class gS extends cn{async _call(e){return new Ae(await super._call(e))}}class ja extends Q{}class _S extends ja{}class yS extends ja{async _call(e){return new Fn(await super._call(e))}}class wS extends ja{async _call(e){return new Ae(await super._call(e))}}class bS extends ja{async _call(e){return new E3(await super._call(e))}}class vS extends ja{async _call(e){return new vt(await super._call(e))}}class M_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class $S extends M_{}class xS extends M_{async generate_speech(e,r,{threshold:n=.5,minlenratio:a=0,maxlenratio:i=20,vocoder:s=null}={}){const o={input_ids:e},{encoder_outputs:l,encoder_attention_mask:u}=await Ca(this,o),p=l.dims[1]/this.config.reduction_factor,h=Math.floor(p*i),m=Math.floor(p*a),d=this.config.num_mel_bins;let _=[],w=null,v=null,S=0;for(;;){++S;const T=Bg(!!v);let A;v?A=v.output_sequence_out:A=new ue("float32",new Float32Array(d),[1,1,d]);let P={use_cache_branch:T,output_sequence:A,encoder_attention_mask:u,speaker_embeddings:r,encoder_hidden_states:l};this.addPastKeyValues(P,w),v=await Fr(this.sessions.decoder_model_merged,P),w=this.getPastKeyValues(v,w);const{prob:B,spectrum:D}=v;if(_.push(D),S>=m&&(Array.from(B.data).filter(q=>q>=n).length>0||S>=h))break}const $=gr(_),{waveform:E}=await Fr(s.sessions.model,{spectrogram:$});return{spectrogram:$,waveform:E}}}class SS extends Q{main_input_name="spectrogram"}class kS extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class ES extends kS{}class O_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class CS extends O_{}class TS extends O_{}class z_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class AS extends z_{}class IS extends z_{}class P_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class MS extends P_{}class OS extends P_{}class vl extends Q{}class zS extends vl{}class PS extends vl{static async from_pretrained(e,r={}){return r.model_file_name??="text_model",super.from_pretrained(e,r)}}class RS extends vl{static async from_pretrained(e,r={}){return r.model_file_name??="audio_model",super.from_pretrained(e,r)}}class BS extends Q{}class R_ extends BS{async _call(e){return new T3(await super._call(e))}}class B_ extends Q{}class DS extends B_{}class NS extends B_{}class D_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class FS extends D_{}class LS extends D_{}class N_ extends Q{}class US extends N_{}class WS extends N_{async _call(e){return new Ae(await super._call(e))}}class F_ extends Q{forward_params=["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}_apply_and_filter_by_delay_pattern_mask(e){const[r,n]=e.dims,a=this.config.decoder.num_codebooks,i=n-a;let s=0;for(let u=0;u0&&m<=i&&(e.data[s++]=e.data[u])}const o=Math.floor(r/a),l=s/(o*a);return new ue(e.type,e.data.slice(0,s),[o,a,l])}prepare_inputs_for_generation(e,r,n){let a=structuredClone(e);for(let s=0;s=o&&(a[s][o]=BigInt(this.config.decoder.pad_token_id));return n.guidance_scale!==null&&n.guidance_scale>1&&(a=a.concat(a)),super.prepare_inputs_for_generation(a,r,n)}async generate(e){const r=await super.generate(e),n=this._apply_and_filter_by_delay_pattern_mask(r).unsqueeze_(0),{audio_values:a}=await Fr(this.sessions.encodec_decode,{audio_codes:n});return a}}class L_ extends Q{}class VS extends L_{}class GS extends L_{async _call(e){return new Ae(await super._call(e))}}class U_ extends Q{}class HS extends U_{}class jS extends U_{async _call(e){return new Ae(await super._call(e))}}class W_ extends Q{}class qS extends W_{}class KS extends W_{async _call(e){return new Ae(await super._call(e))}}class V_ extends Q{}class YS extends V_{}class XS extends V_{async _call(e){return new Ae(await super._call(e))}}class et{static MODEL_CLASS_MAPPINGS=null;static BASE_IF_FAIL=!1;static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",model_file_name:o=null,subfolder:l="onnx",device:u=null,dtype:p=null,use_external_data_format:h=null,session_options:m={}}={}){let d={progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,model_file_name:o,subfolder:l,device:u,dtype:p,use_external_data_format:h,session_options:m};if(d.config=await Mg.from_pretrained(e,d),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);for(let _ of this.MODEL_CLASS_MAPPINGS){const w=_.get(d.config.model_type);if(w)return await w[1].from_pretrained(e,d)}if(this.BASE_IF_FAIL)return console.warn(`Unknown model class "${d.config.model_type}", attempting to construct from base class.`),await Q.from_pretrained(e,d);throw Error(`Unsupported model type: ${d.config.model_type}`)}}const QS=new Map([["bert",["BertModel",y2]],["nomic_bert",["NomicBertModel",S2]],["roformer",["RoFormerModel",k2]],["electra",["ElectraModel",R2]],["esm",["EsmModel",s1]],["convbert",["ConvBertModel",I2]],["camembert",["CamembertModel",L2]],["deberta",["DebertaModel",H2]],["deberta-v2",["DebertaV2Model",X2]],["mpnet",["MPNetModel",f1]],["albert",["AlbertModel",x1]],["distilbert",["DistilBertModel",t1]],["roberta",["RobertaModel",G1]],["xlm",["XLMModel",Y1]],["xlm-roberta",["XLMRobertaModel",e$]],["clap",["ClapModel",zS]],["clip",["CLIPModel",c$]],["clipseg",["CLIPSegModel",w$]],["chinese_clip",["ChineseCLIPModel",y$]],["siglip",["SiglipModel",f$]],["mobilebert",["MobileBertModel",d1]],["squeezebert",["SqueezeBertModel",w1]],["wav2vec2",["Wav2Vec2Model",eS]],["wav2vec2-bert",["Wav2Vec2BertModel",cS]],["unispeech",["UniSpeechModel",aS]],["unispeech-sat",["UniSpeechSatModel",oS]],["hubert",["HubertModel",fS]],["wavlm",["WavLMModel",_S]],["audio-spectrogram-transformer",["ASTModel",i$]],["vits",["VitsModel",R_]],["detr",["DetrModel",mx]],["table-transformer",["TableTransformerModel",wx]],["vit",["ViTModel",J$]],["fastvit",["FastViTModel",tx]],["mobilevit",["MobileViTModel",ix]],["mobilevitv2",["MobileViTV2Model",ox]],["owlvit",["OwlViTModel",ux]],["owlv2",["Owlv2Model",cx]],["beit",["BeitModel",hx]],["deit",["DeiTModel",$x]],["convnext",["ConvNextModel",Nx]],["convnextv2",["ConvNextV2Model",Lx]],["dinov2",["Dinov2Model",Wx]],["resnet",["ResNetModel",Sx]],["swin",["SwinModel",Ex]],["swin2sr",["Swin2SRModel",Tx]],["donut-swin",["DonutSwinModel",Dx]],["yolos",["YolosModel",Gx]],["dpt",["DPTModel",Ix]],["glpn",["GLPNModel",Px]],["hifigan",["SpeechT5HifiGan",SS]],["efficientnet",["EfficientNetModel",US]],["mobilenet_v1",["MobileNetV1Model",VS]],["mobilenet_v2",["MobileNetV2Model",HS]],["mobilenet_v3",["MobileNetV3Model",qS]],["mobilenet_v4",["MobileNetV4Model",YS]]]),ZS=new Map([["t5",["T5Model",C1]],["longt5",["LongT5Model",A1]],["mt5",["MT5Model",M1]],["bart",["BartModel",z1]],["mbart",["MBartModel",B1]],["marian",["MarianModel",Xx]],["whisper",["WhisperModel",o$]],["m2m_100",["M2M100Model",Zx]],["blenderbot",["BlenderbotModel",L1]],["blenderbot-small",["BlenderbotSmallModel",W1]]]),JS=new Map([["bloom",["BloomModel",q$]],["gpt2",["GPT2Model",v$]],["gptj",["GPTJModel",C$]],["gpt_bigcode",["GPTBigCodeModel",A$]],["gpt_neo",["GPTNeoModel",x$]],["gpt_neox",["GPTNeoXModel",k$]],["codegen",["CodeGenModel",M$]],["llama",["LlamaModel",z$]],["cohere",["CohereModel",R$]],["gemma",["GemmaModel",D$]],["openelm",["OpenELMModel",F$]],["qwen2",["Qwen2Model",U$]],["phi",["PhiModel",V$]],["phi3",["Phi3Model",H$]],["mpt",["MptModel",Y$]],["opt",["OPTModel",Q$]],["mistral",["MistralModel",CS]],["starcoder2",["Starcoder2Model",AS]],["falcon",["FalconModel",MS]],["stablelm",["StableLmModel",FS]]]),$l=new Map([["speecht5",["SpeechT5ForSpeechToText",$S]],["whisper",["WhisperForConditionalGeneration",l$]]]),G_=new Map([["speecht5",["SpeechT5ForTextToSpeech",xS]]]),H_=new Map([["vits",["VitsModel",R_]],["musicgen",["MusicgenForConditionalGeneration",F_]]]),j_=new Map([["bert",["BertForSequenceClassification",b2]],["roformer",["RoFormerForSequenceClassification",C2]],["electra",["ElectraForSequenceClassification",D2]],["esm",["EsmForSequenceClassification",l1]],["convbert",["ConvBertForSequenceClassification",O2]],["camembert",["CamembertForSequenceClassification",W2]],["deberta",["DebertaForSequenceClassification",q2]],["deberta-v2",["DebertaV2ForSequenceClassification",Z2]],["mpnet",["MPNetForSequenceClassification",g1]],["albert",["AlbertForSequenceClassification",S1]],["distilbert",["DistilBertForSequenceClassification",r1]],["roberta",["RobertaForSequenceClassification",j1]],["xlm",["XLMForSequenceClassification",Q1]],["xlm-roberta",["XLMRobertaForSequenceClassification",r$]],["bart",["BartForSequenceClassification",R1]],["mbart",["MBartForSequenceClassification",N1]],["mobilebert",["MobileBertForSequenceClassification",p1]],["squeezebert",["SqueezeBertForSequenceClassification",v1]]]),q_=new Map([["bert",["BertForTokenClassification",v2]],["roformer",["RoFormerForTokenClassification",T2]],["electra",["ElectraForTokenClassification",N2]],["esm",["EsmForTokenClassification",u1]],["convbert",["ConvBertForTokenClassification",z2]],["camembert",["CamembertForTokenClassification",V2]],["deberta",["DebertaForTokenClassification",K2]],["deberta-v2",["DebertaV2ForTokenClassification",J2]],["mpnet",["MPNetForTokenClassification",_1]],["distilbert",["DistilBertForTokenClassification",n1]],["roberta",["RobertaForTokenClassification",q1]],["xlm",["XLMForTokenClassification",Z1]],["xlm-roberta",["XLMRobertaForTokenClassification",n$]]]),xl=new Map([["t5",["T5ForConditionalGeneration",T1]],["longt5",["LongT5ForConditionalGeneration",I1]],["mt5",["MT5ForConditionalGeneration",O1]],["bart",["BartForConditionalGeneration",P1]],["mbart",["MBartForConditionalGeneration",D1]],["marian",["MarianMTModel",Qx]],["m2m_100",["M2M100ForConditionalGeneration",Jx]],["blenderbot",["BlenderbotForConditionalGeneration",U1]],["blenderbot-small",["BlenderbotSmallForConditionalGeneration",V1]]]),Sl=new Map([["bloom",["BloomForCausalLM",K$]],["gpt2",["GPT2LMHeadModel",$$]],["gptj",["GPTJForCausalLM",T$]],["gpt_bigcode",["GPTBigCodeForCausalLM",I$]],["gpt_neo",["GPTNeoForCausalLM",S$]],["gpt_neox",["GPTNeoXForCausalLM",E$]],["codegen",["CodeGenForCausalLM",O$]],["llama",["LlamaForCausalLM",P$]],["cohere",["CohereForCausalLM",B$]],["gemma",["GemmaForCausalLM",N$]],["openelm",["OpenELMForCausalLM",L$]],["qwen2",["Qwen2ForCausalLM",W$]],["phi",["PhiForCausalLM",G$]],["phi3",["Phi3ForCausalLM",j$]],["mpt",["MptForCausalLM",X$]],["opt",["OPTForCausalLM",Z$]],["mbart",["MBartForCausalLM",F1]],["mistral",["MistralForCausalLM",TS]],["starcoder2",["Starcoder2ForCausalLM",IS]],["falcon",["FalconForCausalLM",OS]],["trocr",["TrOCRForCausalLM",ES]],["stablelm",["StableLmForCausalLM",LS]]]),K_=new Map([["bert",["BertForMaskedLM",w2]],["roformer",["RoFormerForMaskedLM",E2]],["electra",["ElectraForMaskedLM",B2]],["esm",["EsmForMaskedLM",o1]],["convbert",["ConvBertForMaskedLM",M2]],["camembert",["CamembertForMaskedLM",U2]],["deberta",["DebertaForMaskedLM",j2]],["deberta-v2",["DebertaV2ForMaskedLM",Q2]],["mpnet",["MPNetForMaskedLM",m1]],["albert",["AlbertForMaskedLM",E1]],["distilbert",["DistilBertForMaskedLM",i1]],["roberta",["RobertaForMaskedLM",H1]],["xlm",["XLMWithLMHeadModel",X1]],["xlm-roberta",["XLMRobertaForMaskedLM",t$]],["mobilebert",["MobileBertForMaskedLM",c1]],["squeezebert",["SqueezeBertForMaskedLM",b1]]]),Y_=new Map([["bert",["BertForQuestionAnswering",$2]],["roformer",["RoFormerForQuestionAnswering",A2]],["electra",["ElectraForQuestionAnswering",F2]],["convbert",["ConvBertForQuestionAnswering",P2]],["camembert",["CamembertForQuestionAnswering",G2]],["deberta",["DebertaForQuestionAnswering",Y2]],["deberta-v2",["DebertaV2ForQuestionAnswering",e1]],["mpnet",["MPNetForQuestionAnswering",y1]],["albert",["AlbertForQuestionAnswering",k1]],["distilbert",["DistilBertForQuestionAnswering",a1]],["roberta",["RobertaForQuestionAnswering",K1]],["xlm",["XLMForQuestionAnswering",J1]],["xlm-roberta",["XLMRobertaForQuestionAnswering",a$]],["mobilebert",["MobileBertForQuestionAnswering",h1]],["squeezebert",["SqueezeBertForQuestionAnswering",$1]]]),kl=new Map([["vision-encoder-decoder",["VisionEncoderDecoderModel",Gg]]]),e3=new Map([["llava",["LlavaForConditionalGeneration",Hg]],["moondream1",["Moondream1ForConditionalGeneration",d$]]]),t3=new Map([["vision-encoder-decoder",["VisionEncoderDecoderModel",Gg]]]),X_=new Map([["vit",["ViTForImageClassification",ex]],["fastvit",["FastViTForImageClassification",rx]],["mobilevit",["MobileViTForImageClassification",sx]],["mobilevitv2",["MobileViTV2ForImageClassification",lx]],["beit",["BeitForImageClassification",fx]],["deit",["DeiTForImageClassification",xx]],["convnext",["ConvNextForImageClassification",Fx]],["convnextv2",["ConvNextV2ForImageClassification",Ux]],["dinov2",["Dinov2ForImageClassification",Vx]],["resnet",["ResNetForImageClassification",kx]],["swin",["SwinForImageClassification",Cx]],["segformer",["SegformerForImageClassification",DS]],["efficientnet",["EfficientNetForImageClassification",WS]],["mobilenet_v1",["MobileNetV1ForImageClassification",GS]],["mobilenet_v2",["MobileNetV2ForImageClassification",jS]],["mobilenet_v3",["MobileNetV3ForImageClassification",KS]],["mobilenet_v4",["MobileNetV4ForImageClassification",XS]]]),Q_=new Map([["detr",["DetrForObjectDetection",gx]],["table-transformer",["TableTransformerForObjectDetection",bx]],["yolos",["YolosForObjectDetection",Hx]]]),Z_=new Map([["owlvit",["OwlViTForObjectDetection",dx]],["owlv2",["Owlv2ForObjectDetection",px]]]),J_=new Map([["detr",["DetrForSegmentation",_x]],["clipseg",["CLIPSegForImageSegmentation",b$]]]),e0=new Map([["segformer",["SegformerForSemanticSegmentation",NS]]]),r3=new Map([["sam",["SamModel",Kx]]]),t0=new Map([["wav2vec2",["Wav2Vec2ForCTC",tS]],["wav2vec2-bert",["Wav2Vec2BertForCTC",pS]],["unispeech",["UniSpeechForCTC",iS]],["unispeech-sat",["UniSpeechSatForCTC",lS]],["wavlm",["WavLMForCTC",yS]],["hubert",["HubertForCTC",mS]]]),r0=new Map([["wav2vec2",["Wav2Vec2ForSequenceClassification",rS]],["wav2vec2-bert",["Wav2Vec2BertForSequenceClassification",hS]],["unispeech",["UniSpeechForSequenceClassification",sS]],["unispeech-sat",["UniSpeechSatForSequenceClassification",uS]],["wavlm",["WavLMForSequenceClassification",wS]],["hubert",["HubertForSequenceClassification",gS]],["audio-spectrogram-transformer",["ASTForAudioClassification",s$]]]),n3=new Map([["wavlm",["WavLMForXVector",bS]]]),a3=new Map([["unispeech-sat",["UniSpeechSatForAudioFrameClassification",dS]],["wavlm",["WavLMForAudioFrameClassification",vS]],["wav2vec2",["Wav2Vec2ForAudioFrameClassification",nS]]]),i3=new Map([["vitmatte",["VitMatteForImageMatting",ax]]]),n0=new Map([["swin2sr",["Swin2SRForImageSuperResolution",Ax]]]),a0=new Map([["dpt",["DPTForDepthEstimation",Mx]],["depth_anything",["DepthAnythingForDepthEstimation",zx]],["glpn",["GLPNForDepthEstimation",Rx]]]),i0=new Map([["clip",["CLIPVisionModelWithProjection",h$]],["siglip",["SiglipVisionModel",g$]]]),s0=[[QS,ve.EncoderOnly],[ZS,ve.EncoderDecoder],[JS,ve.DecoderOnly],[j_,ve.EncoderOnly],[q_,ve.EncoderOnly],[xl,ve.Seq2Seq],[$l,ve.Seq2Seq],[Sl,ve.DecoderOnly],[K_,ve.EncoderOnly],[Y_,ve.EncoderOnly],[kl,ve.Vision2Seq],[e3,ve.ImageTextToText],[X_,ve.EncoderOnly],[J_,ve.EncoderOnly],[e0,ve.EncoderOnly],[i3,ve.EncoderOnly],[n0,ve.EncoderOnly],[a0,ve.EncoderOnly],[Q_,ve.EncoderOnly],[Z_,ve.EncoderOnly],[r3,ve.MaskGeneration],[t0,ve.EncoderOnly],[r0,ve.EncoderOnly],[G_,ve.Seq2Seq],[H_,ve.EncoderOnly],[n3,ve.EncoderOnly],[a3,ve.EncoderOnly],[i0,ve.EncoderOnly]];for(const[t,e]of s0)for(const[r,n]of t.values())es.set(r,e),ba.set(n,r),zg.set(r,n);const s3=[["MusicgenForConditionalGeneration",F_,ve.Musicgen],["CLIPTextModelWithProjection",p$,ve.EncoderOnly],["SiglipTextModel",m$,ve.EncoderOnly],["ClapTextModelWithProjection",PS,ve.EncoderOnly],["ClapAudioModelWithProjection",RS,ve.EncoderOnly]];for(const[t,e,r]of s3)es.set(t,r),ba.set(e,t),zg.set(t,e);class ma extends et{static MODEL_CLASS_MAPPINGS=s0.map(e=>e[0]);static BASE_IF_FAIL=!0}class Ep extends et{static MODEL_CLASS_MAPPINGS=[j_]}class o3 extends et{static MODEL_CLASS_MAPPINGS=[q_]}class ho extends et{static MODEL_CLASS_MAPPINGS=[xl]}class l3 extends et{static MODEL_CLASS_MAPPINGS=[$l]}class u3 extends et{static MODEL_CLASS_MAPPINGS=[G_]}class d3 extends et{static MODEL_CLASS_MAPPINGS=[H_]}class c3 extends et{static MODEL_CLASS_MAPPINGS=[Sl]}class p3 extends et{static MODEL_CLASS_MAPPINGS=[K_]}class h3 extends et{static MODEL_CLASS_MAPPINGS=[Y_]}class f3 extends et{static MODEL_CLASS_MAPPINGS=[kl]}class m3 extends et{static MODEL_CLASS_MAPPINGS=[X_]}class g3 extends et{static MODEL_CLASS_MAPPINGS=[J_]}class _3 extends et{static MODEL_CLASS_MAPPINGS=[e0]}class y3 extends et{static MODEL_CLASS_MAPPINGS=[Q_]}class w3 extends et{static MODEL_CLASS_MAPPINGS=[Z_]}class b3 extends et{static MODEL_CLASS_MAPPINGS=[t0]}class v3 extends et{static MODEL_CLASS_MAPPINGS=[r0]}class $3 extends et{static MODEL_CLASS_MAPPINGS=[t3]}class x3 extends et{static MODEL_CLASS_MAPPINGS=[n0]}class S3 extends et{static MODEL_CLASS_MAPPINGS=[a0]}class k3 extends et{static MODEL_CLASS_MAPPINGS=[i0]}class Ae extends Yt{constructor({logits:e}){super(),this.logits=e}}class E3 extends Yt{constructor({logits:e,embeddings:r}){super(),this.logits=e,this.embeddings=r}}class vt extends Yt{constructor({logits:e}){super(),this.logits=e}}class $t extends Yt{constructor({logits:e}){super(),this.logits=e}}class Ct extends Yt{constructor({start_logits:e,end_logits:r}){super(),this.start_logits=e,this.end_logits=r}}class Fn extends Yt{constructor({logits:e}){super(),this.logits=e}}class C3 extends Yt{constructor({alphas:e}){super(),this.alphas=e}}class T3 extends Yt{constructor({waveform:e,spectrogram:r}){super(),this.waveform=e,this.spectrogram=r}}const Gt=typeof self<"u",A3=Gt&&self.constructor.name==="DedicatedWorkerGlobalScope";let Qr,o0,Rr;if(Gt)Qr=(t,e)=>{if(!self.OffscreenCanvas)throw new Error("OffscreenCanvas not supported by this browser.");return new self.OffscreenCanvas(t,e)},Rr=self.createImageBitmap,o0=self.ImageData;else if(Je)Rr=async t=>{const r=(await t.metadata()).channels,{data:n,info:a}=await t.rotate().raw().toBuffer({resolveWithObject:!0}),i=new yt(new Uint8ClampedArray(n),a.width,a.height,a.channels);return r!==void 0&&r!==a.channels&&i.convert(r),i};else throw new Error("Unable to load image processing library.");const I3={0:"nearest",1:"lanczos",2:"bilinear",3:"bicubic",4:"box",5:"hamming"},M3=new Map([["png","image/png"],["jpg","image/jpeg"],["jpeg","image/jpeg"],["gif","image/gif"]]);class yt{constructor(e,r,n,a){this.data=e,this.width=r,this.height=n,this.channels=a}get size(){return[this.width,this.height]}static async read(e){if(e instanceof yt)return e;if(typeof e=="string"||e instanceof URL)return await this.fromURL(e);throw new Error(`Unsupported input type: ${typeof e}`)}static fromCanvas(e){if(!Gt)throw new Error("fromCanvas() is only supported in browser environments.");const n=e.getContext("2d").getImageData(0,0,e.width,e.height).data;return new yt(n,e.width,e.height,4)}static async fromURL(e){const r=await Ui(e);if(r.status!==200)throw new Error(`Unable to read image from "${e}" (${r.status} ${r.statusText})`);const n=await r.blob();return this.fromBlob(n)}static async fromBlob(e){if(Gt){const r=await Rr(e),n=Qr(r.width,r.height).getContext("2d");return n.drawImage(r,0,0),new this(n.getImageData(0,0,r.width,r.height).data,r.width,r.height,4)}else{const r=Je(await e.arrayBuffer());return await Rr(r)}}static fromTensor(e,r="CHW"){if(e.dims.length!==3)throw new Error(`Tensor should have 3 dimensions, but has ${e.dims.length} dimensions.`);if(r==="CHW")e=e.transpose(1,2,0);else if(r!=="HWC")throw new Error(`Unsupported channel format: ${r}`);if(!(e.data instanceof Uint8ClampedArray||e.data instanceof Uint8Array))throw new Error(`Unsupported tensor type: ${e.type}`);switch(e.dims[2]){case 1:case 2:case 3:case 4:return new yt(e.data,e.dims[1],e.dims[0],e.dims[2]);default:throw new Error(`Unsupported number of channels: ${e.dims[2]}`)}}grayscale(){if(this.channels===1)return this;const e=new Uint8ClampedArray(this.width*this.height*1);switch(this.channels){case 3:case 4:for(let r=0,n=0;r=0?l=n:p=-n,a>=0?u=a:h=-a,o.drawImage(s,l,u,e,r,p,h,e,r),new yt(o.getImageData(0,0,e,r).data,e,r,4).convert(i)}else{let i=this.toSharp();if(n>=0&&a>=0)i=i.extract({left:Math.floor(n),top:Math.floor(a),width:e,height:r});else if(n<=0&&a<=0){const s=Math.floor(-a),o=Math.floor(-n);i=i.extend({top:s,left:o,right:e-this.width-o,bottom:r-this.height-s})}else{let s=[0,0],o=0;a<0?(s[0]=Math.floor(-a),s[1]=r-this.height-s[0]):o=Math.floor(a);let l=[0,0],u=0;n<0?(l[0]=Math.floor(-n),l[1]=e-this.width-l[0]):u=Math.floor(n),i=i.extend({top:s[0],bottom:s[1],left:l[0],right:l[1]}).extract({left:u,top:o,width:e,height:r})}return await Rr(i)}}async toBlob(e="image/png",r=1){if(!Gt)throw new Error("toBlob() is only supported in browser environments.");return await this.toCanvas().convertToBlob({type:e,quality:r})}toTensor(e="CHW"){let r=new ue("uint8",new Uint8Array(this.data),[this.height,this.width,this.channels]);if(e!=="HWC")if(e==="CHW")r=r.permute(2,0,1);else throw new Error(`Unsupported channel format: ${e}`);return r}toCanvas(){if(!Gt)throw new Error("toCanvas() is only supported in browser environments.");const e=this.clone().rgba(),r=Qr(e.width,e.height),n=new o0(e.data,e.width,e.height);return r.getContext("2d").putImageData(n,0,0),r}_update(e,r,n,a=null){return this.data=e,this.width=r,this.height=n,a!==null&&(this.channels=a),this}clone(){return new yt(this.data.slice(),this.width,this.height,this.channels)}convert(e){if(this.channels===e)return this;switch(e){case 1:this.grayscale();break;case 3:this.rgb();break;case 4:this.rgba();break;default:throw new Error(`Conversion failed due to unsupported number of channels: ${this.channels}`)}return this}async save(e){if(Gt){if(A3)throw new Error("Unable to save an image from a Web Worker.");const r=e.split(".").pop().toLowerCase(),n=M3.get(r)??"image/png",a=await this.toBlob(n),i=URL.createObjectURL(a),s=document.createElement("a");s.href=i,s.download=e,s.click(),s.remove()}else{if(zt.useFS)return await this.toSharp().toFile(e);throw new Error("Unable to save the image because filesystem is disabled in this environment.")}}toSharp(){if(Gt)throw new Error("toSharp() is only supported in server-side environments.");return Je(this.data,{raw:{width:this.width,height:this.height,channels:this.channels}})}}async function O3(t,e){if(typeof AudioContext>"u")throw Error("Unable to load audio from path/URL since `AudioContext` is not available in your environment. Instead, audio data should be passed directly to the pipeline/processor. For more information and some example code, see https://huggingface.co/docs/transformers.js/guides/node-audio-processing.");const r=await(await Ui(t)).arrayBuffer(),n=new AudioContext({sampleRate:e});typeof e>"u"&&console.warn(`No sampling rate provided, using default of ${n.sampleRate}Hz.`);const a=await n.decodeAudioData(r);let i;if(a.numberOfChannels===2){const s=Math.sqrt(2),o=a.getChannelData(0),l=a.getChannelData(1);i=new Float32Array(o.length);for(let u=0;u2595*Math.log10(1+t/700),kaldi:t=>1127*Math.log(1+t/700),slaney:(t,e=1e3,r=15,n=27/Math.log(6.4))=>t>=e?r+Math.log(t/e)*n:3*t/200};function fo(t,e="htk"){const r=z3[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}const P3={htk:t=>700*(10**(t/2595)-1),kaldi:t=>700*(Math.exp(t/1127)-1),slaney:(t,e=1e3,r=15,n=Math.log(6.4)/27)=>t>=r?e*Math.exp(n*(t-r)):200*t/3};function R3(t,e="htk"){const r=P3[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}function B3(t,e){const r=Float64Array.from({length:e.length-1},(s,o)=>e[o+1]-e[o]),n=Array.from({length:t.length},()=>new Array(e.length));for(let s=0;snew Array(t.length));for(let s=0;st+n*i)}function Ta(t,e,r,n,a,i=null,s="htk",o=!1){if(i!==null&&i!=="slaney")throw new Error('norm must be one of null or "slaney"');const l=fo(r,s),u=fo(n,s),p=Tp(l,u,e+2);let h=R3(p,s),m;if(o){const _=a/(t*2);m=fo(Float64Array.from({length:t},(w,v)=>v*_),s),h=p}else m=Tp(0,Math.floor(a/2),t);const d=B3(m,h);if(i!==null&&i==="slaney")for(let _=0;_a)throw Error(`frame_length (${r}) may not be larger than fft_length (${a})`);if(T!==r)throw new Error(`Length of the window (${T}) must equal frame_length (${r})`);if(n<=0)throw new Error("hop_length must be greater than zero");if(i===null&&p!==null)throw new Error("You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram. Specify `power` to fix this issue.");if(s){if(o!=="reflect")throw new Error(`pad_mode="${o}" not implemented yet.`);const R=Math.floor((a-1)/2)+1;t=D3(t,R,R)}const A=Math.floor(1+Math.floor((t.length-r)/n)),P=l?Math.floor(a/2)+1:a;let B=A,D=A;S!==null&&(S>A?$&&(D=S):D=B=S);const q=new V0(a),H=new Float64Array(a),ie=new Float64Array(q.outputBufferSize),te=new Float32Array(P*D);for(let R=0;R=1;--re)H[re]-=u*H[re-1];H[0]*=1-u}for(let re=0;reMath.pow(o,.85));break;default:throw new Error(`Unknown window type ${e}.`)}if(r&&(s=s.subarray(0,t)),n===null)return s;if(t>n)throw new Error(`Length of the window (${t}) may not be larger than frame_length (${n})`);return s}function L3([t,e,r,n]){return[t-r/2,e-n/2,t+r/2,e+n/2]}function El(t,e=.5,r=null,n=!1){const a=t.logits,i=t.pred_boxes,[s,o,l]=a.dims;if(r!==null&&r.length!==s)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");let u=[];for(let p=0;pe&&S.push(E)}else{let E=Kt(v.data)[1];if(E===l-1||($=wt(v.data),$[E]A*h[(P+1)%2])),m.boxes.push(T),m.classes.push(E),m.scores.push($[E])}}u.push(m)}return u}function qa(t,e){if(!(t instanceof Float32Array||t instanceof Float64Array))throw new Error(`${e} expects input to be a Float32Array or a Float64Array, but got ${t?.constructor?.name??typeof t} instead. If using the feature extractor directly, remember to use \`read_audio(url, sampling_rate)\` to obtain the raw audio data of the file/url.`)}function Ap(t,e,r=0,n=null){const a=t/e;let i=H0(a)*e;return n!==null&&i>n&&(i=Math.floor(a)*e),ii?u=Math.floor(i*l/a):i>a&&(l=Math.floor(a*u/i)),await e.resize(u,l,{resample:n}))}async crop_margin(e,r=200){const n=e.clone().grayscale(),a=Fp(n.data)[0],s=Kt(n.data)[0]-a;if(s===0)return e;const o=r/255;let l=n.width,u=n.height,p=0,h=0;const m=n.data;for(let d=0;dthis.preprocess(i)));return{pixel_values:ka(n.map(i=>i.pixel_values),0),original_sizes:n.map(i=>i.original_size),reshaped_input_sizes:n.map(i=>i.reshaped_input_size)}}}class U3 extends He{post_process_semantic_segmentation(e,r=null){const n=e.logits,a=n.dims[0];if(r!==null&&r.length!==a)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");const i=[];for(let s=0;sm[E]&&(m[E]=$[E],d[E]=S)}const _=new Array(l.dims[0]),w=h.data;for(let S=0;SS!==void 0);i.push({segmentation:h,labels:v})}return i}}class u0 extends He{}class W3 extends u0{}class V3 extends He{}class G3 extends He{}class d0 extends He{}class H3 extends d0{}class j3 extends He{}class q3 extends He{}class c0 extends He{constructor(e){super(e),this.crop_pct=this.config.crop_pct??224/256}async resize(e){const r=this.size?.shortest_edge;if(r===void 0)throw new Error("Size dictionary must contain 'shortest_edge' key.");if(r<384){const n=Math.floor(r/this.crop_pct),[a,i]=this.get_resize_output_image_size(e,{shortest_edge:n});e=await e.resize(a,i,{resample:this.resample}),e=await e.center_crop(r,r)}else e=await e.resize(r,r,{resample:this.resample});return e}}class K3 extends c0{}class Y3 extends He{}class X3 extends He{}class Q3 extends He{constructor(e){super(e),this.include_top=this.config.include_top??!0,this.include_top&&(this.image_std=this.image_std.map(r=>r*r))}}class Z3 extends He{}class J3 extends He{}class ek extends He{}class tk extends He{}class p0 extends He{}class rk extends p0{}class h0 extends He{post_process_object_detection(...e){return El(...e)}}class nk extends h0{}class ak extends He{}class ik extends He{}class f0 extends He{pad_image(e,r,n,a={}){const[i,s,o]=r;let l=this.image_mean;Array.isArray(this.image_mean)||(l=new Array(o).fill(l));let u=this.image_std;Array.isArray(u)||(u=new Array(o).fill(l));const p=l.map((h,m)=>-h/u[m]);return super.pad_image(e,r,n,{center:!0,constant_values:p,...a})}}class sk extends f0{}class ok extends He{async _call(e){const r=await super._call(e),n=[r.pixel_values.dims[0],64,64],a=new ue("int64",new BigInt64Array(n.reduce((i,s)=>i*s)).fill(1n),n);return{...r,pixel_mask:a}}post_process_object_detection(...e){return El(...e)}remove_low_and_no_objects(e,r,n,a){let i=[],s=[],o=[];for(let l=0;ln&&(i.push(p),s.push(d),o.push(h))}return[i,s,o]}check_segment_validity(e,r,n,a=.5,i=.8){let s=[],o=0,l=0;const u=r[n].data;for(let h=0;h=a&&++l;let p=o>0&&l>0;return p&&(p=o/l>i),[p,s]}compute_segments(e,r,n,a,i,s=null,o=null){let[l,u]=o??e[0].dims,p=new ue("int32",new Int32Array(l*u),[l,u]),h=[];if(o!==null)for(let v=0;vd[E]&&(m[E]=v,d[E]=$[E])}let _=0;const w=p.data;for(let v=0;va!==r.dims[i]))throw Error(`The first ${n.length} dimensions of 'input_points' and 'input_labels' must be the same.`);return new ue("int64",e.flat(1/0).map(BigInt),n)}async _call(e,{input_points:r=null,input_labels:n=null,input_boxes:a=null}={}){const i=await super._call(e);if(r&&(i.input_points=this.reshape_input_points(r,i.original_sizes,i.reshaped_input_sizes)),n){if(!i.input_points)throw Error("`input_points` must be provided if `input_labels` are provided.");i.input_labels=this.add_input_labels(n,i.input_points)}return a&&(i.input_boxes=this.reshape_input_points(a,i.original_sizes,i.reshaped_input_sizes,!0)),i}async post_process_masks(e,r,n,{mask_threshold:a=0,binarize:i=!0,pad_size:s=null}={}){const o=[];s=s??this.pad_size;const l=[s.height,s.width];for(let u=0;ua&&(_[w]=1);m=new ue("bool",_,m.dims)}o.push(m)}return o}generate_crop_boxes(e,r,{crop_n_layers:n=0,overlap_ratio:a=512/1500,points_per_crop:i=32,crop_n_points_downscale_factor:s=1}={}){}}class dk extends He{pad_image(e,r,n,a={}){const[i,s,o]=r;return super.pad_image(e,r,{width:s+(n-s%n)%n,height:i+(n-i%n)%n},{mode:"symmetric",center:!1,constant_values:-1,...a})}}class ck extends He{async _call(e,r){Array.isArray(e)||(e=[e]),Array.isArray(r)||(r=[r]);const n=await Promise.all(e.map(s=>this.preprocess(s))),a=await Promise.all(r.map(s=>this.preprocess(s,{do_normalize:!1,do_convert_rgb:!1,do_convert_grayscale:!0})));return{pixel_values:ka(n.map((s,o)=>gr([s.pixel_values,a[o].pixel_values],0)),0),original_sizes:n.map(s=>s.original_size),reshaped_input_sizes:n.map(s=>s.reshaped_input_size)}}}class pk extends pn{constructor(e){super(e),this.config.mel_filters??=Ta(Math.floor(1+this.config.n_fft/2),this.config.feature_size,0,8e3,this.config.sampling_rate,"slaney","slaney"),this.window=ms(this.config.n_fft,"hann")}async _extract_fbank_features(e){const r=await fs(e,this.window,this.config.n_fft,this.config.hop_length,{power:2,mel_filters:this.config.mel_filters,log_mel:"log10",max_num_frames:this.config.nb_max_frames}),n=r.data,a=Kt(n)[0];for(let i=0;ithis.config.n_samples?(console.warn("Attempting to extract features for audio longer than 30 seconds. If using a pipeline to extract transcript from a long audio clip, remember to specify `chunk_length_s` and/or `stride_length_s`."),r=e.slice(0,this.config.n_samples)):(r=new Float32Array(this.config.n_samples),r.set(e)),{input_features:(await this._extract_fbank_features(r)).unsqueeze_(0)}}}class hk extends pn{_zero_mean_unit_var_norm(e){const n=e.reduce((i,s)=>i+s,0)/e.length,a=e.reduce((i,s)=>i+(s-n)**2,0)/e.length;return e.map(i=>(i-n)/Math.sqrt(a+1e-7))}async _call(e){qa(e,"Wav2Vec2FeatureExtractor"),e instanceof Float64Array&&(e=new Float32Array(e));let r=e;this.config.do_normalize&&(r=this._zero_mean_unit_var_norm(r));const n=[1,r.length];return{input_values:new ue("float32",r,n),attention_mask:new ue("int64",new BigInt64Array(r.length).fill(1n),n)}}}class fk extends pn{constructor(e){super(e);const r=this.config.sampling_rate,n=Ta(256,this.config.num_mel_bins,20,Math.floor(r/2),r,null,"kaldi",!0);for(let a=0;an*32768),fs(e,this.window,400,160,{fft_length:512,power:2,center:!1,preemphasis:.97,mel_filters:this.mel_filters,log_mel:"log",mel_floor:1192092955078125e-22,remove_dc_offset:!0,max_num_frames:r,transpose:!0})}async _call(e,{padding:r=!0,pad_to_multiple_of:n=2,do_normalize_per_mel_bins:a=!0,return_attention_mask:i=!0}={}){qa(e,"SeamlessM4TFeatureExtractor");let s=await this._extract_fbank_features(e,this.config.max_length);if(a){const[_,w]=s.dims,v=s.data;for(let S=0;S0){const $=new Float32Array(w*(_+S));$.set(v),$.fill(this.config.padding_value,v.length);const E=_+S;s=new ue(s.type,$,[E,w]),i&&(o=new ue("int64",new BigInt64Array(E),[1,E]),o.data.fill(1n,0,_))}}const[l,u]=s.dims,p=this.config.stride;if(l%p!==0)throw new Error(`The number of frames (${l}) must be a multiple of the stride (${p}).`);const m=s.view(1,Math.floor(l/p),u*p),d={input_features:m};if(i){const _=m.dims[1],w=new BigInt64Array(_);if(o){const v=o.data;for(let S=1,$=0;S0)if(n==="rand_trunc"){const o=Math.floor(Math.random()*(s+1));e=e.subarray(o,o+r),i=await this._extract_fbank_features(e,this.mel_filters_slaney,this.config.nb_max_samples)}else throw new Error(`Truncation strategy "${n}" not implemented`);else{if(s<0){let o=new Float64Array(r);if(o.set(e),a==="repeat")for(let l=e.length;lyt.read(e)))}async function ts(t,e){return Array.isArray(t)||(t=[t]),await Promise.all(t.map(r=>typeof r=="string"||r instanceof URL?O3(r,e):r instanceof Float64Array?new Float32Array(r):r))}function m0(t,e){e&&(t=t.map(s=>s|0));const[r,n,a,i]=t;return{xmin:r,ymin:n,xmax:a,ymax:i}}class tt extends bt{constructor({task:e,model:r,tokenizer:n=null,processor:a=null}){super(),this.task=e,this.model=r,this.tokenizer=n,this.processor=a}async dispose(){await this.model.dispose()}}class xk extends tt{constructor(e){super(e)}async _call(e,{top_k:r=1}={}){const n=this.tokenizer(e,{padding:!0,truncation:!0}),a=await this.model(n),i=this.model.config.problem_type==="multi_label_classification"?l=>l.sigmoid():l=>new ue("float32",wt(l.data),l.dims),s=this.model.config.id2label,o=[];for(const l of a.logits){const u=i(l),p=await Dn(u,r),h=p[0].tolist(),d=p[1].tolist().map((_,w)=>({label:s?s[_]:`LABEL_${_}`,score:h[w]}));r===1?o.push(...d):o.push(d)}return Array.isArray(e)||r===1?o:o[0]}}class Sk extends tt{constructor(e){super(e)}async _call(e,{ignore_labels:r=["O"]}={}){const n=Array.isArray(e),a=this.tokenizer(n?e:[e],{padding:!0,truncation:!0}),s=(await this.model(a)).logits,o=this.model.config.id2label,l=[];for(let u=0;uE==this.tokenizer.sep_token_id);l[h].map((E,T)=>E==1&&(T===0||T>d&&u.findIndex(A=>A==m[T])===-1));const _=i[h].tolist(),w=s[h].tolist();for(let E=1;E<_.length;++E)(l[h]==0||E<=d||u.findIndex(T=>T==m[E])!==-1)&&(_[E]=-1/0,w[E]=-1/0);const v=wt(_).map((E,T)=>[E,T]),S=wt(w).map((E,T)=>[E,T]);v[0][0]=0,S[0][0]=0;const $=P0(v,S).filter(E=>E[0][1]<=E[1][1]).map(E=>[E[0][1],E[1][1],E[0][0]*E[1][0]]).sort((E,T)=>T[2]-E[2]);for(let E=0;E_==this.tokenizer.mask_token_id);if(u===-1)throw Error(`Mask token (${this.tokenizer.mask_token}) not found in text.`);const p=a[o][u],h=await Dn(new ue("float32",wt(p.data),p.dims),r),m=h[0].tolist(),d=h[1].tolist();i.push(d.map((_,w)=>{const v=l.slice();return v[u]=_,{score:m[w],token:Number(_),token_str:this.tokenizer.model.vocab[_],sequence:this.tokenizer.decode(v,{skip_special_tokens:!0})}}))}return Array.isArray(e)?i:i[0]}}class Cl extends tt{_key="generated_text";constructor(e){super(e)}async _call(e,r={}){Array.isArray(e)||(e=[e]),this.model.config.prefix&&(e=e.map(l=>this.model.config.prefix+l));const n=this.model.config.task_specific_params;n&&n[this.task]&&n[this.task].prefix&&(e=e.map(l=>n[this.task].prefix+l));const a=this.tokenizer,i={padding:!0,truncation:!0};let s;this instanceof g0&&"_build_translation_inputs"in a?s=a._build_translation_inputs(e,i,r):s=a(e,i);const o=await this.model.generate({...s,...r});return a.batch_decode(o,{skip_special_tokens:!0}).map(l=>({[this._key]:l}))}}class Ck extends Cl{_key="summary_text";constructor(e){super(e)}}class g0 extends Cl{_key="translation_text";constructor(e){super(e)}}class Tk extends tt{constructor(e){super(e)}async _call(e,r={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}}class Ak extends tt{constructor(e){super(e),this.label2id=Object.fromEntries(Object.entries(this.model.config.label2id).map(([r,n])=>[r.toLowerCase(),n])),this.entailment_id=this.label2id.entailment,this.entailment_id===void 0&&(console.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."),this.entailment_id=2),this.contradiction_id=this.label2id.contradiction??this.label2id.not_entailment,this.contradiction_id===void 0&&(console.warn("Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."),this.contradiction_id=0)}async _call(e,r,{hypothesis_template:n="This example is {}.",multi_label:a=!1}={}){const i=Array.isArray(e);i||(e=[e]),Array.isArray(r)||(r=[r]);const s=r.map(u=>n.replace("{}",u)),o=a||r.length===1,l=[];for(const u of e){const p=[];for(const d of s){const _=this.tokenizer(u,{text_pair:d,padding:!0,truncation:!0}),w=await this.model(_);o?p.push([w.logits.data[this.contradiction_id],w.logits.data[this.entailment_id]]):p.push(w.logits.data[this.entailment_id])}const m=(o?p.map(d=>wt(d)[1]):wt(p)).map((d,_)=>[d,_]).sort((d,_)=>_[0]-d[0]);l.push({sequence:u,labels:m.map(d=>r[d[1]]),scores:m.map(d=>d[0])})}return i?l:l[0]}}class Ik extends tt{constructor(e){super(e)}async _call(e,{pooling:r="none",normalize:n=!1,quantize:a=!1,precision:i="binary"}={}){const s=this.tokenizer(e,{padding:!0,truncation:!0}),o=await this.model(s);let l=o.last_hidden_state??o.logits??o.token_embeddings;if(r!=="none")if(r==="mean")l=Aw(l,s.attention_mask);else if(r==="cls")l=l.slice(null,0);else throw Error(`Pooling method '${r}' not supported.`);return n&&(l=l.normalize(2,-1)),a&&(l=Nw(l,i)),l}}class Mk extends tt{constructor(e){super(e)}async _call(e,{pool:r=null}={}){const n=await Tr(e),{pixel_values:a}=await this.processor(n),i=await this.model({pixel_values:a});let s;if(r){if(!("pooler_output"in i))throw Error("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");s=i.pooler_output}else s=i.last_hidden_state??i.logits??i.image_embeds;return s}}class Ok extends tt{constructor(e){super(e)}async _call(e,{top_k:r=5}={}){const n=this.processor.feature_extractor.config.sampling_rate,a=await ts(e,n),i=this.model.config.id2label,s=[];for(const o of a){const l=await this.processor(o),p=(await this.model(l)).logits[0],h=await Dn(new ue("float32",wt(p.data),p.dims),r),m=h[0].tolist(),_=h[1].tolist().map((w,v)=>({label:i?i[w]:`LABEL_${w}`,score:m[v]}));s.push(_)}return Array.isArray(e)?s:s[0]}}class zk extends tt{constructor(e){super(e)}async _call(e,r,{hypothesis_template:n="This is a sound of {}."}={}){const a=!Array.isArray(e);a&&(e=[e]);const i=r.map(p=>n.replace("{}",p)),s=this.tokenizer(i,{padding:!0,truncation:!0}),o=this.processor.feature_extractor.config.sampling_rate,l=await ts(e,o),u=[];for(const p of l){const h=await this.processor(p),m=await this.model({...s,...h}),d=wt(m.logits_per_audio.data);u.push([...d].map((_,w)=>({score:_,label:r[w]})))}return a?u[0]:u}}class Pk extends tt{constructor(e){super(e)}async _call(e,r={}){switch(this.model.config.model_type){case"whisper":return this._call_whisper(e,r);case"wav2vec2":case"wav2vec2-bert":case"unispeech":case"unispeech-sat":case"hubert":return this._call_wav2vec2(e,r);default:throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)}}async _call_wav2vec2(e,r){r.language&&console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'),r.task&&console.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');const n=!Array.isArray(e);n&&(e=[e]);const a=this.processor.feature_extractor.config.sampling_rate,i=await ts(e,a),s=[];for(const o of i){const l=await this.processor(o),p=(await this.model(l)).logits[0],h=[];for(const d of p)h.push(Kt(d.data)[1]);const m=this.tokenizer.decode(h);s.push({text:m})}return n?s[0]:s}async _call_whisper(e,r){const n=r.return_timestamps??!1,a=r.chunk_length_s??0,i=r.force_full_sequences??!1;let s=r.stride_length_s??null;n==="word"&&(r.return_token_timestamps=!0);const o=!Array.isArray(e);o&&(e=[e]);const l=this.processor.feature_extractor.config.chunk_length/this.model.config.max_source_positions,u=this.processor.feature_extractor.config.hop_length,p=this.processor.feature_extractor.config.sampling_rate,h=await ts(e,p),m=[];for(const d of h){let _=[];if(a>0){if(s===null)s=a/6;else if(a<=s)throw Error("`chunk_length_s` must be larger than `stride_length_s`.");const S=p*a,$=p*s,E=S-2*$;let T=0;for(;T=d.length;_.push({stride:[A.length,B?0:$,D?0:$],input_features:P.input_features,is_last:D}),T+=E}}else _=[{stride:[d.length,0,0],input_features:(await this.processor(d)).input_features,is_last:!0}];for(const S of _){r.num_frames=Math.floor(S.stride[0]/u);const $=await this.model.generate({inputs:S.input_features,...r});n==="word"?(S.tokens=$.sequences[0].tolist(),S.token_timestamps=$.token_timestamps.tolist()[0].map(E=>Mi(E,2))):S.tokens=$[0].tolist(),S.stride=S.stride.map(E=>E/p)}const[w,v]=this.tokenizer._decode_asr(_,{time_precision:l,return_timestamps:n,force_full_sequences:i});m.push({text:w,...v})}return o?m[0]:m}}class Rk extends tt{constructor(e){super(e)}async _call(e,r={}){const n=Array.isArray(e),a=await Tr(e),{pixel_values:i}=await this.processor(a),s=[];for(const o of i){o.dims=[1,...o.dims];const l=await this.model.generate({inputs:o,...r}),u=this.tokenizer.batch_decode(l,{skip_special_tokens:!0}).map(p=>({generated_text:p.trim()}));s.push(u)}return n?s:s[0]}}class Bk extends tt{constructor(e){super(e)}async _call(e,{top_k:r=5}={}){const n=await Tr(e),{pixel_values:a}=await this.processor(n),i=await this.model({pixel_values:a}),s=this.model.config.id2label,o=[];for(const l of i.logits){const u=await Dn(new ue("float32",wt(l.data),l.dims),r),p=u[0].tolist(),m=u[1].tolist().map((d,_)=>({label:s?s[d]:`LABEL_${d}`,score:p[_]}));o.push(m)}return Array.isArray(e)?o:o[0]}}class Dk extends tt{constructor(e){super(e),this.subtasks_mapping={panoptic:"post_process_panoptic_segmentation",instance:"post_process_instance_segmentation",semantic:"post_process_semantic_segmentation"}}async _call(e,{threshold:r=.5,mask_threshold:n=.5,overlap_mask_area_threshold:a=.8,label_ids_to_fuse:i=null,target_sizes:s=null,subtask:o=null}={}){if(Array.isArray(e)&&e.length!==1)throw Error("Image segmentation pipeline currently only supports a batch size of 1.");const u=await Tr(e),p=u.map(S=>[S.height,S.width]),{pixel_values:h,pixel_mask:m}=await this.processor(u),d=await this.model({pixel_values:h,pixel_mask:m});let _=null;if(o!==null)_=this.subtasks_mapping[o];else for(let[S,$]of Object.entries(this.subtasks_mapping))if($ in this.processor.feature_extractor){_=this.processor.feature_extractor[$].bind(this.processor.feature_extractor),o=S;break}const w=this.model.config.id2label,v=[];if(o==="panoptic"||o==="instance"){const S=_(d,r,n,a,i,s??p)[0],$=S.segmentation;for(const E of S.segments_info){const T=new Uint8ClampedArray($.data.length);for(let P=0;P<$.data.length;++P)$.data[P]===E.id&&(T[P]=255);const A=new yt(T,$.dims[1],$.dims[0],1);v.push({score:E.score,label:w[E.label_id],mask:A})}}else if(o==="semantic"){const{segmentation:S,labels:$}=_(d,s??p)[0];for(const E of $){const T=new Uint8ClampedArray(S.data.length);for(let P=0;Pn.replace("{}",m)),o=this.tokenizer(s,{padding:this.model.config.model_type==="siglip"?"max_length":!0,truncation:!0}),{pixel_values:l}=await this.processor(i),u=await this.model({...o,pixel_values:l}),p=this.model.config.model_type==="siglip"?m=>m.sigmoid().data:m=>wt(m.data),h=[];for(const m of u.logits_per_image){const _=[...p(m)].map((w,v)=>({score:w,label:r[v]}));_.sort((w,v)=>v.score-w.score),h.push(_)}return a?h:h[0]}}class Fk extends tt{constructor(e){super(e)}async _call(e,{threshold:r=.9,percentage:n=!1}={}){const a=Array.isArray(e);if(a&&e.length!==1)throw Error("Object detection pipeline currently only supports a batch size of 1.");const i=await Tr(e),s=n?null:i.map(d=>[d.height,d.width]),{pixel_values:o,pixel_mask:l}=await this.processor(i),u=await this.model({pixel_values:o,pixel_mask:l}),p=this.processor.feature_extractor.post_process_object_detection(u,r,s),h=this.model.config.id2label,m=p.map(d=>d.boxes.map((_,w)=>({score:d.scores[w],label:h[d.classes[w]],box:m0(_,!n)})));return a?m:m[0]}}class Lk extends tt{constructor(e){super(e)}async _call(e,r,{threshold:n=.1,top_k:a=null,percentage:i=!1}={}){const s=Array.isArray(e),o=await Tr(e),l=this.tokenizer(r,{padding:!0,truncation:!0}),u=await this.processor(o),p=[];for(let h=0;h({score:v.scores[E],label:r[v.classes[E]],box:m0($,!i)})).sort(($,E)=>E.score-$.score);a!==null&&(S=S.slice(0,a)),p.push(S)}return s?p:p[0]}}class Uk extends tt{constructor(e){super(e)}async _call(e,r,n={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}}class Wk extends tt{DEFAULT_VOCODER_ID="Xenova/speecht5_hifigan";constructor(e){super(e),this.vocoder=e.vocoder??null}async _call(e,{speaker_embeddings:r=null}={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}async _call_text_to_waveform(e){const r=this.tokenizer(e,{padding:!0,truncation:!0}),{waveform:n}=await this.model(r),a=this.model.config.sampling_rate;return{audio:n.data,sampling_rate:a}}async _call_text_to_spectrogram(e,{speaker_embeddings:r}){if(this.vocoder||(console.log("No vocoder specified, using default HifiGan vocoder."),this.vocoder=await ma.from_pretrained(this.DEFAULT_VOCODER_ID,{dtype:"fp32"})),(typeof r=="string"||r instanceof URL)&&(r=new Float32Array(await(await fetch(r)).arrayBuffer())),r instanceof Float32Array)r=new ue("float32",r,[1,r.length]);else if(!(r instanceof ue))throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.");const{input_ids:n}=this.tokenizer(e,{padding:!0,truncation:!0}),{waveform:a}=await this.model.generate_speech(n,r,{vocoder:this.vocoder}),i=this.processor.feature_extractor.config.sampling_rate;return{audio:a.data,sampling_rate:i}}}class Vk extends tt{constructor(e){super(e)}async _call(e){const r=await Tr(e),n=await this.processor(r),a=await this.model(n),i=[];for(const s of a.reconstruction){const o=s.squeeze().clamp_(0,1).mul_(255).round_().to("uint8");i.push(yt.fromTensor(o))}return i.length>1?i:i[0]}}class Gk extends tt{constructor(e){super(e)}async _call(e){const r=await Tr(e),n=await this.processor(r),{predicted_depth:a}=await this.model(n),i=[];for(let s=0;s1?i:i[0]}}const Ip=Object.freeze({"text-classification":{tokenizer:ht,pipeline:xk,model:Ep,default:{model:"Xenova/distilbert-base-uncased-finetuned-sst-2-english"},type:"text"},"token-classification":{tokenizer:ht,pipeline:Sk,model:o3,default:{model:"Xenova/bert-base-multilingual-cased-ner-hrl"},type:"text"},"question-answering":{tokenizer:ht,pipeline:kk,model:h3,default:{model:"Xenova/distilbert-base-cased-distilled-squad"},type:"text"},"fill-mask":{tokenizer:ht,pipeline:Ek,model:p3,default:{model:"Xenova/bert-base-uncased"},type:"text"},summarization:{tokenizer:ht,pipeline:Ck,model:ho,default:{model:"Xenova/distilbart-cnn-6-6"},type:"text"},translation:{tokenizer:ht,pipeline:g0,model:ho,default:{model:"Xenova/t5-small"},type:"text"},"text2text-generation":{tokenizer:ht,pipeline:Cl,model:ho,default:{model:"Xenova/flan-t5-small"},type:"text"},"text-generation":{tokenizer:ht,pipeline:Tk,model:c3,default:{model:"Xenova/gpt2"},type:"text"},"zero-shot-classification":{tokenizer:ht,pipeline:Ak,model:Ep,default:{model:"Xenova/distilbert-base-uncased-mnli"},type:"text"},"audio-classification":{pipeline:Ok,model:v3,processor:Ot,default:{model:"Xenova/wav2vec2-base-superb-ks"},type:"audio"},"zero-shot-audio-classification":{tokenizer:ht,pipeline:zk,model:ma,processor:Ot,default:{model:"Xenova/clap-htsat-unfused"},type:"multimodal"},"automatic-speech-recognition":{tokenizer:ht,pipeline:Pk,model:[l3,b3],processor:Ot,default:{model:"Xenova/whisper-tiny.en"},type:"multimodal"},"text-to-audio":{tokenizer:ht,pipeline:Wk,model:[d3,u3],processor:[Ot,null],default:{model:"Xenova/speecht5_tts"},type:"text"},"image-to-text":{tokenizer:ht,pipeline:Rk,model:f3,processor:Ot,default:{model:"Xenova/vit-gpt2-image-captioning"},type:"multimodal"},"image-classification":{pipeline:Bk,model:m3,processor:Ot,default:{model:"Xenova/vit-base-patch16-224"},type:"multimodal"},"image-segmentation":{pipeline:Dk,model:[g3,_3],processor:Ot,default:{model:"Xenova/detr-resnet-50-panoptic"},type:"multimodal"},"zero-shot-image-classification":{tokenizer:ht,pipeline:Nk,model:ma,processor:Ot,default:{model:"Xenova/clip-vit-base-patch32"},type:"multimodal"},"object-detection":{pipeline:Fk,model:y3,processor:Ot,default:{model:"Xenova/detr-resnet-50"},type:"multimodal"},"zero-shot-object-detection":{tokenizer:ht,pipeline:Lk,model:w3,processor:Ot,default:{model:"Xenova/owlvit-base-patch32"},type:"multimodal"},"document-question-answering":{tokenizer:ht,pipeline:Uk,model:$3,processor:Ot,default:{model:"Xenova/donut-base-finetuned-docvqa"},type:"multimodal"},"image-to-image":{pipeline:Vk,model:x3,processor:Ot,default:{model:"Xenova/swin2SR-classical-sr-x2-64"},type:"image"},"depth-estimation":{pipeline:Gk,model:S3,processor:Ot,default:{model:"Xenova/dpt-large"},type:"image"},"feature-extraction":{tokenizer:ht,pipeline:Ik,model:ma,default:{model:"Xenova/all-MiniLM-L6-v2"},type:"text"},"image-feature-extraction":{processor:Ot,pipeline:Mk,model:[k3,ma],default:{model:"Xenova/vit-base-patch16-224-in21k"},type:"image"}}),Hk=Object.freeze({"sentiment-analysis":"text-classification",ner:"token-classification",asr:"automatic-speech-recognition","text-to-speech":"text-to-audio",embeddings:"feature-extraction"});async function jk(t,e=null,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",device:o=null,dtype:l=null,model_file_name:u=null,session_options:p={}}={}){t=Hk[t]??t;const h=Ip[t.split("_",1)[0]];if(!h)throw Error(`Unsupported pipeline: ${t}. Must be one of [${Object.keys(Ip)}]`);e||(e=h.default.model,console.log(`No model specified. Using default model: "${e}".`));const m={progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,device:o,dtype:l,model_file_name:u,session_options:p},d=new Map([["tokenizer",h.tokenizer],["model",h.model],["processor",h.processor]]),_=await qk(d,e,m);_.task=t,An(r,{status:"ready",task:t,model:e});const w=h.pipeline;return new w(_)}async function qk(t,e,r){const n=Object.create(null),a=[];for(let[i,s]of t.entries()){if(!s)continue;let o;Array.isArray(s)?o=new Promise(async(l,u)=>{let p;for(let h of s){if(h===null){l(null);return}try{l(await h.from_pretrained(e,r));return}catch(m){if(m.message?.includes("Unsupported model type"))p=m;else{u(m);return}}}u(p)}):o=s.from_pretrained(e,r),n[i]=o,a.push(o)}await Promise.all(a);for(let[i,s]of Object.entries(n))n[i]=await s;return n}an.IS_PROCESS_AVAILABLE;const gs=document.getElementById("status"),Mp=document.getElementById("container"),ga=document.getElementById("video"),go=document.getElementById("overlay");gs.textContent="Loading model (8 MB)...";const Kk="onnx-community/mobilenetv4_conv_small.e2400_r224_in1k";let _0;try{_0=await jk("image-classification",Kk,{dtype:"fp32"})}catch(t){throw gs.textContent=t.message,alert(t.message),t}gs.textContent="Ready";const Yk=.1,Zr=256,Bo=document.createElement("canvas");Bo.width=Bo.height=Zr;const Op=Bo.getContext("2d",{willReadFrequently:!0});let _o=!1,yo;function y0(){_o||(_o=!0,async function(){Op.drawImage(ga,0,0,Zr,Zr);const t=Op.getImageData(0,0,Zr,Zr).data,e=new yt(t,Zr,Zr,4),r=await _0(e,{top_k:null});go.innerHTML="";for(const{label:n,score:a}of r){if(a{ga.srcObject=t,ga.play();const e=t.getVideoTracks()[0],{width:r,height:n}=e.getSettings();ga.width=r,ga.height=n;const a=r/n,[i,s]=a>720/405?[720,720/a]:[405*a,405];Mp.style.width=`${i}px`,Mp.style.height=`${s}px`,window.requestAnimationFrame(y0)}).catch(t=>{alert(t)}); +'}}{% endif %}`}class Mv extends Ee{}function pl(t,e,r,n){if(!("language_codes"in t)||!Array.isArray(t.language_codes))throw new Error("Tokenizer must have `language_codes` attribute set and it should be an array of language ids.");if(!("languageRegex"in t)||!(t.languageRegex instanceof RegExp))throw new Error("Tokenizer must have `languageRegex` attribute set and it should be a regular expression.");if(!("lang_to_token"in t)||typeof t.lang_to_token!="function")throw new Error("Tokenizer must have `lang_to_token` attribute set and it should be a function.");const a=n.src_lang,i=n.tgt_lang;if(!t.language_codes.includes(i))throw new Error(`Target language code "${i}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);if(a!==void 0){if(!t.language_codes.includes(a))throw new Error(`Source language code "${a}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);for(const s of t.post_processor.config.single)if("SpecialToken"in s&&t.languageRegex.test(s.SpecialToken.id)){s.SpecialToken.id=t.lang_to_token(a);break}}return n.forced_bos_token_id=t.model.convert_tokens_to_ids([t.lang_to_token(i)])[0],t._call(e,r)}class Ov extends Ee{constructor(e,r){super(e,r),this.languageRegex=/^[a-z]{3}_[A-Z][a-z]{3}$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)),this.lang_to_token=n=>n}_build_translation_inputs(e,r,n){return pl(this,e,r,n)}}class zv extends Ee{constructor(e,r){super(e,r),this.languageRegex=/^__[a-z]{2,3}__$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)).map(n=>n.slice(2,-2)),this.lang_to_token=n=>`__${n}__`}_build_translation_inputs(e,r,n){return pl(this,e,r,n)}}class Pv extends Ee{_default_chat_template='{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}';get timestamp_begin(){return this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0]+1}_decode_asr(e,{return_timestamps:r=!1,return_language:n=!1,time_precision:a=null,force_full_sequences:i=!0}={}){if(a===null)throw Error("Must specify time_precision");let s=null;const o=r==="word";function l(){return{language:s,timestamp:[null,null],text:""}}const u=[];let p=l(),h=0;const m=this.timestamp_begin;let d=[],_=[],w=!1,v=null;const S=new Set(this.all_special_ids);for(const T of e){const A=T.tokens,P=o?T.token_timestamps:null;let B=null,D=m;if("stride"in T){const[ie,te,de]=T.stride;if(h-=te,v=ie-de,te&&(D=te/a+m),de)for(let se=A.length-1;se>=0;--se){const M=Number(A[se]);if(M>=m){if(B!==null&&(M-m)*a=m){const de=(te-m)*a+h,se=Mi(de,2);if(B!==null&&te>=B)w=!0;else if(w||d.length>0&&te0?(d.push(q),o&&_.push(H)):d.every(ie=>ie.length===0)&&(p=l(),d=[],q=[],_=[],H=[])}if(d.length>0){if(i&&r)throw new Error("Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.");const[T,A]=this.findLongestCommonSequence(d,_),P=this.decode(T);p.text=P,o&&(p.words=this.collateWordTimestamps(T,A,s)),u.push(p)}let $=Object.create(null);const E=u.map(T=>T.text).join("");if(r||n){for(let T=0;T0;let o=s?[]:null,l=s?r[0]:null;for(let u=1;use===ie[M]).length,de=te/T+A;te>1&&de>h&&(h=de,m=[P,B,q,H])}const[_,w,v,S]=m,$=Math.floor((w+_)/2),E=Math.floor((S+v)/2);i.push(...n.slice(0,$)),n=p.slice(E),a=n.length,s&&(o.push(...l.slice(0,$)),l=r[u].slice(E))}return i.push(...n),s?(o.push(...l),[i,o]):[i,[]]}collateWordTimestamps(e,r,n){const[a,i,s]=this.combineTokensIntoWords(e,n),o=[];for(let l=0;l=a){const o=((s-a)*n).toFixed(2);i.push(`<|${o}|>`),i.push([])}else i[i.length-1].push(s);return i=i.map(s=>typeof s=="string"?s:super.decode(s,r)),i.join("")}splitTokensOnUnicode(e){const r=this.decode(e,{decode_with_timestamps:!0}),n="�",a=[],i=[],s=[];let o=[],l=[],u=0;for(let p=0;p=this.model.tokens_to_ids.get("<|endoftext|>"),_=p.startsWith(" "),w=p.trim(),v=l.test(w);if(d||_||v||i.length===0)i.push(p),s.push(h),o.push(m);else{const S=i.length-1;i[S]+=p,s[S].push(...h),o[S].push(...m)}}return[i,s,o]}mergePunctuations(e,r,n,a,i){const s=structuredClone(e),o=structuredClone(r),l=structuredClone(n);let u=s.length-2,p=s.length-1;for(;u>=0;)s[u].startsWith(" ")&&a.includes(s[u].trim())?(s[p]=s[u]+s[p],o[p]=ft(o[u],o[p]),l[p]=ft(l[u],l[p]),s[u]="",o[u]=[],l[u]=[]):p=u,--u;for(u=0,p=1;ph),o.filter(h=>h.length>0),l.filter(h=>h.length>0)]}get_decoder_prompt_ids({language:e=null,task:r=null,no_timestamps:n=!0}={}){const a=[];if(e){const i=_g(e),s=this.model.tokens_to_ids.get(`<|${i}|>`);if(s===void 0)throw new Error(`Unable to find language "${i}" in model vocabulary. Please report this issue at ${po}.`);a.push(s)}else a.push(null);if(r){if(r=r.toLowerCase(),r!=="transcribe"&&r!=="translate")throw new Error(`Task "${r}" is not supported. Must be one of: ["transcribe", "translate"]`);const i=this.model.tokens_to_ids.get(`<|${r}|>`);if(i===void 0)throw new Error(`Unable to find task "${r}" in model vocabulary. Please report this issue at ${po}.`);a.push(i)}else a.push(null);if(n){const i=this.model.tokens_to_ids.get("<|notimestamps|>");if(i===void 0)throw new Error(`Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at ${po}.`);a.push(i)}return a.map((i,s)=>[s+1,i]).filter(i=>i[1]!==null)}}class Rv extends Ee{}class Bv extends Ee{}class Dv extends Ee{}class Nv extends Ee{constructor(e,r){super(e,r),this.languageRegex=/^(>>\w+<<)\s*/g,this.supported_language_codes=this.model.vocab.filter(n=>this.languageRegex.test(n)),console.warn('WARNING: `MarianTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.')}_encode_text(e){if(e===null)return null;const[r,...n]=e.trim().split(this.languageRegex);if(n.length===0)return super._encode_text(r);if(n.length===2){const[a,i]=n;return this.supported_language_codes.includes(a)||console.warn(`Unsupported language code "${a}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`),ft([a],super._encode_text(i))}}}class Fv extends Ee{}class Ag extends Ee{_default_chat_template="{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"}class Lv extends Ag{}class Uv extends Ee{}class Wv extends Ee{}class Vv extends Ee{constructor(e,r){super(e,r),this.decoder=new Yb({})}}class Gv extends Ee{}class ht{static TOKENIZER_CLASS_MAPPING={T5Tokenizer:yv,DistilBertTokenizer:fv,CamembertTokenizer:mv,DebertaTokenizer:uv,DebertaV2Tokenizer:dv,BertTokenizer:iv,HerbertTokenizer:cv,ConvBertTokenizer:pv,RoFormerTokenizer:hv,XLMTokenizer:gv,ElectraTokenizer:_v,MobileBertTokenizer:ov,SqueezeBertTokenizer:lv,AlbertTokenizer:sv,GPT2Tokenizer:Eg,BartTokenizer:wv,MBartTokenizer:Cg,MBart50Tokenizer:bv,RobertaTokenizer:vv,WhisperTokenizer:Pv,CodeGenTokenizer:Rv,CLIPTokenizer:Bv,SiglipTokenizer:Dv,MarianTokenizer:Nv,BloomTokenizer:$v,NllbTokenizer:Ov,M2M100Tokenizer:zv,LlamaTokenizer:Tg,CodeLlamaTokenizer:xv,XLMRobertaTokenizer:Sv,MPNetTokenizer:kv,FalconTokenizer:Ev,GPTNeoXTokenizer:Cv,EsmTokenizer:Tv,Wav2Vec2CTCTokenizer:Fv,BlenderbotTokenizer:Ag,BlenderbotSmallTokenizer:Lv,SpeechT5Tokenizer:Uv,NougatTokenizer:Wv,VitsTokenizer:Vv,Qwen2Tokenizer:Av,GemmaTokenizer:Iv,Grok1Tokenizer:Mv,CohereTokenizer:Gv,PreTrainedTokenizer:Ee};static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",legacy:o=null}={}){const[l,u]=await yg(e,{progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,legacy:o}),p=u.tokenizer_class?.replace(/Fast$/,"")??"PreTrainedTokenizer";let h=this.TOKENIZER_CLASS_MAPPING[p];return h||(console.warn(`Unknown tokenizer class "${p}", attempting to construct from base class.`),h=Ee),new h(l,u)}}async function Hv(t,e){return await Br(t,"config.json",!0,e)}function fa(t){const e={};let r={};switch(t.model_type){case"llava":case"paligemma":r=fa(t.text_config);break;case"moondream1":r=fa(t.phi_config);break;case"musicgen":r=fa(t.decoder);break;case"gpt2":case"gptj":case"codegen":case"gpt_bigcode":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="n_embd";break;case"gpt_neox":case"stablelm":case"opt":case"phi":case"phi3":case"falcon":e.num_heads="num_attention_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size";break;case"llama":case"cohere":case"mistral":case"starcoder2":case"qwen2":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size",e.num_attention_heads="num_attention_heads";break;case"gemma":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.dim_kv="head_dim";break;case"openelm":e.num_heads="num_kv_heads",e.num_layers="num_transformer_layers",e.dim_kv="head_dim";break;case"gpt_neo":case"donut-swin":e.num_heads="num_heads",e.num_layers="num_layers",e.hidden_size="hidden_size";break;case"bloom":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="hidden_size";break;case"mpt":e.num_heads="n_heads",e.num_layers="n_layers",e.hidden_size="d_model";break;case"t5":case"mt5":case"longt5":e.num_decoder_layers="num_decoder_layers",e.num_decoder_heads="num_heads",e.decoder_dim_kv="d_kv",e.num_encoder_layers="num_layers",e.num_encoder_heads="num_heads",e.encoder_dim_kv="d_kv";break;case"bart":case"mbart":case"marian":case"whisper":case"m2m_100":case"blenderbot":case"blenderbot-small":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="d_model",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="d_model";break;case"speecht5":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="hidden_size",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="hidden_size";break;case"trocr":e.num_encoder_layers=e.num_decoder_layers="decoder_layers",e.num_encoder_heads=e.num_decoder_heads="decoder_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="d_model";break;case"musicgen_decoder":e.num_encoder_layers=e.num_decoder_layers="num_hidden_layers",e.num_encoder_heads=e.num_decoder_heads="num_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="hidden_size";break;case"vision-encoder-decoder":const a=fa(t.decoder),i="num_decoder_layers"in a,s=Dr(t,["model_type","is_encoder_decoder"]);return i?(s.num_decoder_layers=a.num_decoder_layers,s.num_decoder_heads=a.num_decoder_heads,s.decoder_hidden_size=a.decoder_hidden_size,s.num_encoder_layers=a.num_encoder_layers,s.num_encoder_heads=a.num_encoder_heads,s.encoder_hidden_size=a.encoder_hidden_size):(s.num_layers=a.num_layers,s.num_heads=a.num_heads,s.hidden_size=a.hidden_size),s}const n={...r,...Dr(t,["model_type","multi_query","is_encoder_decoder"])};for(const a in e)n[a]=t[e[a]];return n}function Ig(t,{prefix:e="past_key_values",encoder_add_pkv:r=!0}={}){const n={},a=t.normalized_config,i=1;if(a.is_encoder_decoder&&r){const s=a.encoder_dim_kv??a.encoder_hidden_size/a.num_encoder_heads,o=a.decoder_dim_kv??a.decoder_hidden_size/a.num_decoder_heads,l=[i,a.num_encoder_heads,0,s],u=[i,a.num_decoder_heads,0,o];for(let p=0;p=1&&s[s.length-1]>=this.timestamp_begin,l=s.length<2||s[s.length-2]>=this.timestamp_begin;if(o&&(l?i.subarray(this.timestamp_begin).fill(-1/0):i.subarray(0,this.eos_token_id).fill(-1/0)),e[n].length===this.begin_index&&this.max_initial_timestamp_index!==null){const m=this.timestamp_begin+this.max_initial_timestamp_index;i.subarray(m+1).fill(-1/0)}const u=U0(i),p=Math.log(u.subarray(this.timestamp_begin).map(Math.exp).reduce((m,d)=>m+d)),h=Kt(u.subarray(0,this.timestamp_begin))[0];p>h&&i.subarray(0,this.timestamp_begin).fill(-1/0)}return r}}class Zv extends yr{constructor(e){super(),this.no_repeat_ngram_size=e}getNgrams(e){const r=e.length,n=[];for(let i=0;i1 to use the classifier free guidance processor, got guidance scale ${e}.`);this.guidance_scale=e}_call(e,r){if(r.dims[0]!==2*e.length)throw new Error(`Logits should have twice the batch size of the input ids, the first half of batches corresponding to the conditional inputs, and the second half of batches corresponding to the unconditional inputs. Got batch size ${r.dims[0]} for the logits and ${e.length} for the input ids.`);const n=e.length,a=r.slice([0,n],null),i=r.slice([n,r.dims[0]],null);for(let s=0;s1)throw new Error(`\`top_p\` must be a float > 0 and < 1, but is ${e}`);if(!Number.isInteger(n)||n<1)throw new Error(`\`min_tokens_to_keep\` must be a positive integer, but is ${n}`);this.top_p=e,this.filter_value=r,this.min_tokens_to_keep=n}}class s2 extends hl{constructor(e,{filter_value:r=-1/0,min_tokens_to_keep:n=1}={}){if(super(),!Number.isInteger(e)||e<0)throw new Error(`\`top_k\` must be a positive integer, but is ${e}`);this.top_k=Math.max(e,n),this.filter_value=r}}class Og{max_length=20;max_new_tokens=null;min_length=0;min_new_tokens=null;early_stopping=!1;max_time=null;do_sample=!1;num_beams=1;num_beam_groups=1;penalty_alpha=null;use_cache=!0;temperature=1;top_k=50;top_p=1;typical_p=1;epsilon_cutoff=0;eta_cutoff=0;diversity_penalty=0;repetition_penalty=1;encoder_repetition_penalty=1;length_penalty=1;no_repeat_ngram_size=0;bad_words_ids=null;force_words_ids=null;renormalize_logits=!1;constraints=null;forced_bos_token_id=null;forced_eos_token_id=null;remove_invalid_values=!1;exponential_decay_length_penalty=null;suppress_tokens=null;begin_suppress_tokens=null;forced_decoder_ids=null;guidance_scale=null;num_return_sequences=1;output_attentions=!1;output_hidden_states=!1;output_scores=!1;return_dict_in_generate=!1;pad_token_id=null;bos_token_id=null;eos_token_id=null;encoder_no_repeat_ngram_size=0;decoder_start_token_id=null;generation_kwargs={};constructor(e){Object.assign(this,Dr(e,Object.getOwnPropertyNames(this)))}}class fl extends bt{_call(e,r){throw Error("StoppingCriteria needs to be subclassed")}}class ml extends bt{constructor(){super(),this.criteria=[]}push(e){this.criteria.push(e)}extend(e){e instanceof ml?e=e.criteria:e instanceof fl&&(e=[e]),this.criteria.push(...e)}_call(e,r){const n=new Array(e.length).fill(!1);for(const a of this.criteria){const i=a(e,r);for(let s=0;sr.length>=this.max_length)}}class l2 extends fl{constructor(e){super(),Array.isArray(e)||(e=[e]),this.eos_token_id=e}_call(e,r){return e.map(n=>{const a=n.at(-1);return this.eos_token_id.some(i=>a==i)})}}class ss extends bt{constructor(e){super(),this.generation_config=e}async _call(e){return this.sample(e)}async sample(e){throw Error("sample should be implemented in subclasses.")}getLogits(e,r){let n=e.dims.at(-1),a=e.data;if(r===-1)a=a.slice(-n);else{let i=r*n;a=a.slice(i,i+n)}return a}randomSelect(e){let r=0;for(let a=0;a1)return new c2(e);if(e.num_return_sequences>1)throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${e.num_return_sequences}.`);return new u2(e)}}class u2 extends ss{async sample(e){const r=Kt(e.data)[1];return[[BigInt(r),0]]}}class d2 extends ss{async sample(e){let r=e.dims.at(-1);this.generation_config.top_k>0&&(r=Math.min(this.generation_config.top_k,r));const[n,a]=await Dn(e,r),i=wt(n.data);return Array.from({length:this.generation_config.num_beams},()=>{const s=this.randomSelect(i);return[a.data[s],Math.log(i[s])]})}}class c2 extends ss{async sample(e){let r=e.dims.at(-1);this.generation_config.top_k>0&&(r=Math.min(this.generation_config.top_k,r));const[n,a]=await Dn(e,r),i=wt(n.data);return Array.from({length:this.generation_config.num_beams},(s,o)=>[a.data[o],Math.log(i[o])])}}class p2 extends Og{return_timestamps=null;return_token_timestamps=null;num_frames=null;alignment_heads=null;task=null;language=null;no_timestamps_token_id=null;prompt_ids=null;is_multilingual=null;lang_to_id=null;task_to_id=null;max_initial_timestamp_index=1}const ve={EncoderOnly:0,EncoderDecoder:1,Seq2Seq:2,Vision2Seq:3,DecoderOnly:4,MaskGeneration:5,ImageTextToText:6,Musicgen:7},es=new Map,zg=new Map,ba=new Map;async function h2(t,e,r){let n=r.device;n&&typeof n!="string"&&(n.hasOwnProperty(e)?n=n[e]:(console.warn(`Device not specified for ${e}. Using the default device.`),n=null));const a=Sw(n);let i=r.dtype;if(typeof i!="string"&&(i&&i.hasOwnProperty(e)?i=i[e]:(i=qv[a[0]],console.warn(`Dtype not specified for ${e}. Using the default dtype: ${i}.`))),xp.hasOwnProperty(i)){if(i===Rt.fp16&&!await jv())throw new Error("The device does not support fp16.")}else throw new Error(`Invalid dtype: ${i}. Should be one of: ${Object.keys(Rt).join(", ")}`);const s=xp[i],o=`${r.subfolder??""}/${e}${s}.onnx`,l={...r.session_options};l.executionProviders??=a;const u=Ii(t,o,!0,r);let p=[];if(r.use_external_data_format){if(an.IS_NODE_ENV)throw new Error("External data format is not yet supported in Node.js");const m=`${e}${s}.onnx_data`,d=`${r.subfolder??""}/${m}`;p.push(new Promise(async(_,w)=>{const v=await Ii(t,d,!0,r);_({path:m,data:v})}))}else l.externalData!==void 0&&(p=l.externalData.map(async m=>{if(typeof m.data=="string"){const d=await Ii(t,m.data,!0,r);return{...m,data:d}}return m}));if(p.length>0&&(l.externalData=await Promise.all(p)),n==="webgpu"){const m=Ig(r.config,{prefix:"present"});if(Object.keys(m).length>0){const d={};for(const _ in m)d[_]="gpu-buffer";l.preferredOutputLocation=d}}return{buffer:await u,session_options:l}}async function Xr(t,e,r){const n=Object.keys(e),a=await Promise.all(n.map(async s=>h2(t,e[s],r))),i={};for(let s=0;s0)throw new Error(`An error occurred during model execution: "Missing the following inputs: ${n.join(", ")}.`);const a=Object.keys(e).length,i=t.inputNames.length;if(a>i){let s=Object.keys(e).filter(o=>!t.inputNames.includes(o));console.warn(`WARNING: Too many inputs were provided (${a} > ${i}). The following inputs will be ignored: "${s.join(", ")}".`)}return r}async function Fr(t,e){const r=f2(t,e);try{const n=Object.fromEntries(Object.entries(r).map(([i,s])=>[i,s.ort_tensor]));let a=await t.run(n);return a=Pg(a),a}catch(n){throw console.error(`An error occurred during model execution: "${n}".`),console.error("Inputs given to model:",r),n}}function Pg(t){for(let e in t)fg(t[e])?t[e]=new ue(t[e]):typeof t[e]=="object"&&Pg(t[e]);return t}function Rg(t){if(t instanceof ue)return t;if(t.length===0)throw Error("items must be non-empty");if(Array.isArray(t[0])){if(t.some(e=>e.length!==t[0].length))throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");return new ue("int64",BigInt64Array.from(t.flat().map(e=>BigInt(e))),[t.length,t[0].length])}else return new ue("int64",BigInt64Array.from(t.map(e=>BigInt(e))),[1,t.length])}function Bg(t){return new ue("bool",[t],[1])}async function Sp(t,e){let{encoder_outputs:r,past_key_values:n}=e;if(!r){const l=Dr(e,t.sessions.model.inputNames);r=(await Ca(t,l)).last_hidden_state}const{input_ids:a,decoder_input_ids:i,...s}=e;return s.input_ids=i,s.encoder_hidden_states=r,t.sessions.decoder_model_merged.inputNames.includes("encoder_attention_mask")&&(s.encoder_attention_mask=e.attention_mask),await gl(t,s,!0)}async function Ca(t,e){const r=t.sessions.model,n=Object.create(null);for(const a of r.inputNames)n[a]=e[a];return r.inputNames.includes("token_type_ids")&&!n.token_type_ids&&(n.token_type_ids=new ue("int64",new BigInt64Array(n.input_ids.data.length),n.input_ids.dims)),await Fr(r,n)}async function gl(t,e,r=!1){const n=t.sessions[r?"decoder_model_merged":"model"],{past_key_values:a,...i}=e;n.inputNames.includes("use_cache_branch")&&(i.use_cache_branch=Bg(!!a)),n.inputNames.includes("position_ids")&&i.attention_mask&&!i.position_ids&&(i.position_ids=g2(i,a)),t.addPastKeyValues(i,a);const s=Dr(i,n.inputNames);return await Fr(n,s)}async function m2(t,{input_ids:e=null,attention_mask:r=null,pixel_values:n=null,position_ids:a=null,inputs_embeds:i=null,past_key_values:s=null,generation_config:o=null,logits_processor:l=null,...u}){if(!i){if(i=await t.encode_text({input_ids:e}),n&&e.dims[1]!==1){const h=await t.encode_image({pixel_values:n});({inputs_embeds:i,attention_mask:r}=t._merge_input_ids_with_image_features({image_features:h,inputs_embeds:i,input_ids:e,attention_mask:r}))}else if(s&&n&&e.dims[1]===1){const h=e.dims[1],m=Object.values(s)[0].dims.at(-2);r=gr([Ma([e.dims[0],m]),r.slice(null,[r.dims[1]-h,r.dims[1]])],1)}}return await gl(t,{inputs_embeds:i,past_key_values:s,attention_mask:r,position_ids:a,generation_config:o,logits_processor:l},!0)}function g2(t,e=null){const{input_ids:r,inputs_embeds:n,attention_mask:a}=t,[i,s]=a.dims,o=new BigInt64Array(a.data.length);for(let u=0;ui.dims[1])){if(ao==t.config.image_token_index)){const o=t.config.num_image_tokens;if(!o)throw new Error("`num_image_tokens` is missing in the model configuration.");const l=i.dims[1]-(a-o);r.input_ids=i.slice(null,[-l,null]),r.attention_mask=Ma([1,a+l])}}}return r}function _2(t,e,r,n){const{...a}=r;return r.past_key_values&&(e=e.map(s=>[s.at(-1)])),a.decoder_input_ids=Rg(e),a}class Q extends bt{main_input_name="input_ids";forward_params=["input_ids","attention_mask"];constructor(e,r){super(),this.config=e,this.sessions=r;const n=ba.get(this.constructor),a=es.get(n);this.can_generate=!1,this._forward=null,this._prepare_inputs_for_generation=null,a===ve.DecoderOnly?(this.can_generate=!0,this._forward=gl,this._prepare_inputs_for_generation=kp):a===ve.Seq2Seq||a===ve.Vision2Seq||a===ve.Musicgen?(this.can_generate=!0,this._forward=Sp,this._prepare_inputs_for_generation=_2):a===ve.EncoderDecoder?this._forward=Sp:a===ve.ImageTextToText?(this.can_generate=!0,this._forward=m2,this._prepare_inputs_for_generation=kp):this._forward=Ca,this.can_generate&&this.forward_params.push("past_key_values"),this.custom_config=this.config["transformers.js_config"]??{}}async dispose(){const e=[];for(const r of Object.values(this.sessions))r?.handler?.dispose&&e.push(r.handler.dispose());return await Promise.all(e)}static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",model_file_name:o=null,subfolder:l="onnx",device:u=null,dtype:p=null,use_external_data_format:h=null,session_options:m={}}={}){let d={progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,model_file_name:o,subfolder:l,device:u,dtype:p,use_external_data_format:h,session_options:m};const _=ba.get(this),w=es.get(_);d.config=await Mg.from_pretrained(e,d);let v;return w===ve.DecoderOnly?v=await Promise.all([Xr(e,{model:d.model_file_name??"model"},d),Br(e,"generation_config.json",!1,d)]):w===ve.Seq2Seq||w===ve.Vision2Seq?v=await Promise.all([Xr(e,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},d),Br(e,"generation_config.json",!1,d)]):w===ve.MaskGeneration?v=await Promise.all([Xr(e,{model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"},d)]):w===ve.EncoderDecoder?v=await Promise.all([Xr(e,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},d)]):w===ve.ImageTextToText?v=await Promise.all([Xr(e,{embed_tokens:"embed_tokens",vision_encoder:"vision_encoder",decoder_model_merged:"decoder_model_merged"},d),Br(e,"generation_config.json",!1,d)]):w===ve.Musicgen?v=await Promise.all([Xr(e,{model:"text_encoder",decoder_model_merged:"decoder_model_merged",encodec_decode:"encodec_decode"},d),Br(e,"generation_config.json",!1,d)]):(w!==ve.EncoderOnly&&console.warn(`Model type for '${_??n?.model_type}' not found, assuming encoder-only architecture. Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`),v=await Promise.all([Xr(e,{model:d.model_file_name??"model"},d)])),new this(d.config,...v)}async _call(e){return await this.forward(e)}async forward(e){return await this._forward(this,e)}_get_logits_warper(e){const r=new Ro;return e.temperature!==null&&e.temperature!==1&&r.push(new a2(e.temperature)),e.top_k!==null&&e.top_k!==0&&r.push(new s2(e.top_k)),e.top_p!==null&&e.top_p<1&&r.push(new i2(e.top_p)),r}_get_logits_processor(e,r,n=null){const a=new Ro;if(e.repetition_penalty!==null&&e.repetition_penalty!==1&&a.push(new Jv(e.repetition_penalty)),e.no_repeat_ngram_size!==null&&e.no_repeat_ngram_size>0&&a.push(new Zv(e.no_repeat_ngram_size)),e.bad_words_ids!==null&&a.push(new r2(e.bad_words_ids,e.eos_token_id)),e.min_length!==null&&e.eos_token_id!==null&&e.min_length>0&&a.push(new e2(e.min_length,e.eos_token_id)),e.min_new_tokens!==null&&e.eos_token_id!==null&&e.min_new_tokens>0&&a.push(new t2(r,e.min_new_tokens,e.eos_token_id)),e.forced_bos_token_id!==null&&a.push(new Kv(e.forced_bos_token_id)),e.forced_eos_token_id!==null&&a.push(new Yv(e.max_length,e.forced_eos_token_id)),e.begin_suppress_tokens!==null){const i=r>1||e.forced_bos_token_id===null?r:r+1;a.push(new Xv(e.begin_suppress_tokens,i))}return e.guidance_scale!==null&&e.guidance_scale>1&&a.push(new n2(e.guidance_scale)),n!==null&&a.extend(n),a}_prepare_generation_config(e,r,n=Og){const a={...this.config};for(const s of["decoder","generator","text_config"])s in a&&Object.assign(a,a[s]);const i=new n(a);return"generation_config"in this&&Object.assign(i,this.generation_config),e&&Object.assign(i,e),r&&Object.assign(i,Dr(r,Object.getOwnPropertyNames(i))),i}_get_stopping_criteria(e,r=null){const n=new ml;return e.max_length!==null&&n.push(new o2(e.max_length,this.config.max_position_embeddings??null)),e.eos_token_id!==null&&n.push(new l2(e.eos_token_id)),r&&n.extend(r),n}_validate_model_class(){if(!this.can_generate){const e=[Sl,kl,xl,$l],r=ba.get(this.constructor),n=new Set,a=this.config.model_type;for(const s of e){const o=s.get(a);o&&n.add(o[0])}let i=`The current model class (${r}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw n.size>0&&(i+=` Please use the following class instead: ${[...n].join(", ")}`),Error(i)}}prepare_inputs_for_generation(...e){return this._prepare_inputs_for_generation(this,...e)}_update_model_kwargs_for_generation({generated_input_ids:e,outputs:r,model_inputs:n,is_encoder_decoder:a}){return n.past_key_values=this.getPastKeyValues(r,n.past_key_values),n.input_ids=new ue("int64",e.flat(),[e.length,1]),a||(n.attention_mask=gr([n.attention_mask,Ma([n.attention_mask.dims[0],1])],1)),n.position_ids=null,n}_prepare_model_inputs({inputs:e,bos_token_id:r,model_kwargs:n}){const a=Dr(n,this.forward_params),i=this.main_input_name;if(i in a){if(e)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else a[i]=e;return{inputs_tensor:a[i],model_inputs:a,model_input_name:i}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:e,model_inputs:r,model_input_name:n,generation_config:a}){const i=Dr(r,this.sessions.model.inputNames);let{last_hidden_state:s}=await Ca(this,i);return a.guidance_scale!==null&&a.guidance_scale>1&&(s=gr([s,Pw(s,0)],0),"attention_mask"in r&&(r.attention_mask=gr([r.attention_mask,Dw(r.attention_mask)],0))),r.encoder_outputs=s,r}_prepare_decoder_input_ids_for_generation({batch_size:e,model_input_name:r,model_kwargs:n,decoder_start_token_id:a,bos_token_id:i,generation_config:s}){let{decoder_input_ids:o,...l}=n;if(!o)if(a??=i,this.config.model_type==="musicgen")o=Array.from({length:e*this.config.decoder.num_codebooks},()=>[a]);else if(Array.isArray(a)){if(a.length!==e)throw new Error(`\`decoder_start_token_id\` expcted to have length ${e} but got ${a.length}`);o=a}else o=Array.from({length:e},()=>[a]);return o=Rg(o),n.decoder_attention_mask=Rw(o),{input_ids:o,model_inputs:l}}async generate({inputs:e=null,generation_config:r=null,logits_processor:n=null,stopping_criteria:a=null,streamer:i=null,...s}){this._validate_model_class(),r=this._prepare_generation_config(r,s);let{inputs_tensor:o,model_inputs:l,model_input_name:u}=this._prepare_model_inputs({inputs:e,model_kwargs:s});const p=this.config.is_encoder_decoder;p&&("encoder_outputs"in l||(l=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:o,model_inputs:l,model_input_name:u,generation_config:r})));let h;p?{input_ids:h,model_inputs:l}=this._prepare_decoder_input_ids_for_generation({batch_size:l[u].dims.at(0),model_input_name:u,model_kwargs:l,decoder_start_token_id:r.decoder_start_token_id,bos_token_id:r.bos_token_id,generation_config:r}):h=l[u];let m=h.dims.at(-1);r.max_new_tokens!==null&&(r.max_length=m+r.max_new_tokens);const d=this._get_logits_processor(r,m,n),_=this._get_stopping_criteria(r,a),w=l[u].dims.at(0),v=ss.getSampler(r),S=new Array(w).fill(0),$=h.tolist();i&&i.put($);let E=null;for(;;){l=this.prepare_inputs_for_generation($,l,r);const A=await this.forward(l),P=A.logits.slice(null,-1,null),B=d($,P),D=[];for(let H=0;HH)){r.return_dict_in_generate&&(E=this.getPastKeyValues(A,l.past_key_values,!1));break}l=this._update_model_kwargs_for_generation({generated_input_ids:D,outputs:A,model_inputs:l,is_encoder_decoder:p})}i&&i.end();const T=new ue("int64",$.flat(),[$.length,$[0].length]);return r.return_dict_in_generate?{sequences:T,past_key_values:E}:T}addAttentionsToBeam(e,r){if(this.config.is_encoder_decoder){if(!r.cross_attentions||r.cross_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce cross-attentions. This is most likely because the model was not exported with `output_attentions=True`.");e.cross_attentions||(e.cross_attentions=[]),e.cross_attentions.push(r.cross_attentions)}if(!r.decoder_attentions||r.decoder_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce decoder-attentions. This is most likely because the model was not exported with `output_attentions=True`.");e.decoder_attentions||(e.decoder_attentions=[]),e.decoder_attentions.push(r.decoder_attentions)}groupBeams(e){const r=Object.create(null);for(const n of e)r[n.id]===void 0?r[n.id]=[n]:r[n.id].push(n);return Object.values(r)}getPastKeyValues(e,r,n=!0){const a=Object.create(null);for(const i in e)if(i.startsWith("present")){let s=i.replace("present","past_key_values");if(r&&i.includes("encoder"))a[s]=r[s];else{if(n&&r){const o=r[s];o.location==="gpu-buffer"&&o.dispose()}a[s]=e[i]}}return a}getAttentions(e){const r=Object.create(null);for(const n of["cross_attentions","decoder_attentions"]){const a=[];for(const i in e)if(i.startsWith(n)){const s=i.split(".").pop();a[s]=e[i]}r[n]=a}return r}addPastKeyValues(e,r){if(r)Object.assign(e,r);else{const n=this.custom_config.kv_cache_dtype??"float32",a=n==="float16"?new Uint16Array:[],i=Ig(this.config);for(const s in i)e[s]=new ue(n,a,i[s])}}}class Yt{}class Pa extends Q{}class y2 extends Pa{}class w2 extends Pa{async _call(e){return new $t(await super._call(e))}}class b2 extends Pa{async _call(e){return new Ae(await super._call(e))}}class v2 extends Pa{async _call(e){return new vt(await super._call(e))}}class $2 extends Pa{async _call(e){return new Ct(await super._call(e))}}class x2 extends Q{}class S2 extends x2{}class Ra extends Q{}class k2 extends Ra{}class E2 extends Ra{async _call(e){return new $t(await super._call(e))}}class C2 extends Ra{async _call(e){return new Ae(await super._call(e))}}class T2 extends Ra{async _call(e){return new vt(await super._call(e))}}class A2 extends Ra{async _call(e){return new Ct(await super._call(e))}}class Ba extends Q{}class I2 extends Ba{}class M2 extends Ba{async _call(e){return new $t(await super._call(e))}}class O2 extends Ba{async _call(e){return new Ae(await super._call(e))}}class z2 extends Ba{async _call(e){return new vt(await super._call(e))}}class P2 extends Ba{async _call(e){return new Ct(await super._call(e))}}class Da extends Q{}class R2 extends Da{}class B2 extends Da{async _call(e){return new $t(await super._call(e))}}class D2 extends Da{async _call(e){return new Ae(await super._call(e))}}class N2 extends Da{async _call(e){return new vt(await super._call(e))}}class F2 extends Da{async _call(e){return new Ct(await super._call(e))}}class Na extends Q{}class L2 extends Na{}class U2 extends Na{async _call(e){return new $t(await super._call(e))}}class W2 extends Na{async _call(e){return new Ae(await super._call(e))}}class V2 extends Na{async _call(e){return new vt(await super._call(e))}}class G2 extends Na{async _call(e){return new Ct(await super._call(e))}}class Fa extends Q{}class H2 extends Fa{}class j2 extends Fa{async _call(e){return new $t(await super._call(e))}}class q2 extends Fa{async _call(e){return new Ae(await super._call(e))}}class K2 extends Fa{async _call(e){return new vt(await super._call(e))}}class Y2 extends Fa{async _call(e){return new Ct(await super._call(e))}}class La extends Q{}class X2 extends La{}class Q2 extends La{async _call(e){return new $t(await super._call(e))}}class Z2 extends La{async _call(e){return new Ae(await super._call(e))}}class J2 extends La{async _call(e){return new vt(await super._call(e))}}class e1 extends La{async _call(e){return new Ct(await super._call(e))}}class Ua extends Q{}class t1 extends Ua{}class r1 extends Ua{async _call(e){return new Ae(await super._call(e))}}class n1 extends Ua{async _call(e){return new vt(await super._call(e))}}class a1 extends Ua{async _call(e){return new Ct(await super._call(e))}}class i1 extends Ua{async _call(e){return new $t(await super._call(e))}}class os extends Q{}class s1 extends os{}class o1 extends os{async _call(e){return new $t(await super._call(e))}}class l1 extends os{async _call(e){return new Ae(await super._call(e))}}class u1 extends os{async _call(e){return new vt(await super._call(e))}}class ls extends Q{}class d1 extends ls{}class c1 extends ls{async _call(e){return new $t(await super._call(e))}}class p1 extends ls{async _call(e){return new Ae(await super._call(e))}}class h1 extends ls{async _call(e){return new Ct(await super._call(e))}}class Wa extends Q{}class f1 extends Wa{}class m1 extends Wa{async _call(e){return new $t(await super._call(e))}}class g1 extends Wa{async _call(e){return new Ae(await super._call(e))}}class _1 extends Wa{async _call(e){return new vt(await super._call(e))}}class y1 extends Wa{async _call(e){return new Ct(await super._call(e))}}class us extends Q{}class w1 extends us{}class b1 extends us{async _call(e){return new $t(await super._call(e))}}class v1 extends us{async _call(e){return new Ae(await super._call(e))}}class $1 extends us{async _call(e){return new Ct(await super._call(e))}}class ds extends Q{}class x1 extends ds{}class S1 extends ds{async _call(e){return new Ae(await super._call(e))}}class k1 extends ds{async _call(e){return new Ct(await super._call(e))}}class E1 extends ds{async _call(e){return new $t(await super._call(e))}}class Dg extends Q{forward_params=["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class C1 extends Dg{}class T1 extends Dg{}class Ng extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class A1 extends Ng{}class I1 extends Ng{}class Fg extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class M1 extends Fg{}class O1 extends Fg{}class _l extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class z1 extends _l{}class P1 extends _l{}class R1 extends _l{async _call(e){return new Ae(await super._call(e))}}class cs extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class B1 extends cs{}class D1 extends cs{}class N1 extends cs{async _call(e){return new Ae(await super._call(e))}}class F1 extends cs{}class Lg extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class L1 extends Lg{}class U1 extends Lg{}class Ug extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class W1 extends Ug{}class V1 extends Ug{}class Va extends Q{}class G1 extends Va{}class H1 extends Va{async _call(e){return new $t(await super._call(e))}}class j1 extends Va{async _call(e){return new Ae(await super._call(e))}}class q1 extends Va{async _call(e){return new vt(await super._call(e))}}class K1 extends Va{async _call(e){return new Ct(await super._call(e))}}class Ga extends Q{}class Y1 extends Ga{}class X1 extends Ga{async _call(e){return new $t(await super._call(e))}}class Q1 extends Ga{async _call(e){return new Ae(await super._call(e))}}class Z1 extends Ga{async _call(e){return new vt(await super._call(e))}}class J1 extends Ga{async _call(e){return new Ct(await super._call(e))}}class Ha extends Q{}class e$ extends Ha{}class t$ extends Ha{async _call(e){return new $t(await super._call(e))}}class r$ extends Ha{async _call(e){return new Ae(await super._call(e))}}class n$ extends Ha{async _call(e){return new vt(await super._call(e))}}class a$ extends Ha{async _call(e){return new Ct(await super._call(e))}}class Wg extends Q{}class i$ extends Wg{}class s$ extends Wg{}class Vg extends Q{requires_attention_mask=!1;main_input_name="input_features";forward_params=["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class o$ extends Vg{}class l$ extends Vg{_prepare_generation_config(e,r){return super._prepare_generation_config(e,r,p2)}_retrieve_init_tokens(e){const r=[e.decoder_start_token_id];let n=e.language;const a=e.task;if(e.is_multilingual){n||(console.warn("No language specified - defaulting to English (en)."),n="en");const s=`<|${_g(n)}|>`;r.push(e.lang_to_id[s]),r.push(e.task_to_id[a??"transcribe"])}else if(n||a)throw new Error("Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");return!e.return_timestamps&&e.no_timestamps_token_id&&r.at(-1)!==e.no_timestamps_token_id?r.push(e.no_timestamps_token_id):e.return_timestamps&&r.at(-1)===e.no_timestamps_token_id&&(console.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),r.pop()),r.filter(i=>i!=null)}async generate({inputs:e=null,generation_config:r=null,logits_processor:n=null,stopping_criteria:a=null,...i}){r=this._prepare_generation_config(r,i);const s=this._retrieve_init_tokens(r);return r.return_timestamps&&(n??=new Ro,n.push(new Qv(r,s))),await super.generate({inputs:e,generation_config:r,logits_processor:n,decoder_input_ids:s,...i})}_extract_token_timestamps(e,r,n=null,a=.02){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");let i=this.config.median_filter_width;i===void 0&&(console.warn("Model config has no `median_filter_width`, using default value of 7."),i=7);const s=e.cross_attentions.map(u=>{let p=Array.from({length:this.config.decoder_layers},(v,S)=>gr(u.map($=>$[S]),2)),h=ka(r.map(([v,S])=>n?p[v].slice(null,S,null,[0,n]):p[v].slice(null,S)));h=h.transpose(1,0,2,3);let[m,d]=Iw(h,-2,0,!0),_=h.clone();for(let v=0;v<_.dims[0];++v){let S=_[v];for(let $=0;$h[S+1]-h[S]),_=ft([1],d).map(v=>!!v),w=[];for(let v=0;v<_.length;++v)_[v]&&w.push(m[v]*a);l[u].data.set(w,1)}return l}}class Gg extends Q{main_input_name="pixel_values";forward_params=["pixel_values","input_ids","encoder_hidden_states","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class u$ extends Q{forward_params=["input_ids","pixel_values","attention_mask","position_ids","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}}class Hg extends u${async encode_image({pixel_values:e}){const r=(await Fr(this.sessions.vision_encoder,{pixel_values:e})).image_features;return this.config.num_image_tokens||(console.warn(`The number of image tokens was not set in the model configuration. Setting it to the number of features detected by the vision encoder (${r.dims[1]}).`),this.config.num_image_tokens=r.dims[1]),r}async encode_text({input_ids:e}){return(await Fr(this.sessions.embed_tokens,{input_ids:e})).inputs_embeds}_merge_input_ids_with_image_features({inputs_embeds:e,image_features:r,input_ids:n,attention_mask:a}){const i=this.config.image_token_index,o=n.tolist().map(m=>m.findIndex(d=>d==i)),l=o.every(m=>m===-1),u=o.every(m=>m!==-1);if(!l&&!u)throw new Error("Every input should contain either 0 or 1 image token.");if(l)return{inputs_embeds:e,attention_mask:a};const p=[],h=[];for(let m=0;mi*s,1);e.input_labels=new ue("int64",new BigInt64Array(a).fill(1n),n)}const r={image_embeddings:e.image_embeddings,image_positional_embeddings:e.image_positional_embeddings};return e.input_points&&(r.input_points=e.input_points),e.input_labels&&(r.input_labels=e.input_labels),e.input_boxes&&(r.input_boxes=e.input_boxes),await Fr(this.sessions.prompt_encoder_mask_decoder,r)}async _call(e){return new Yx(await super._call(e))}}class Yx extends Yt{constructor({iou_scores:e,pred_masks:r}){super(),this.iou_scores=e,this.pred_masks=r}}class A_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class Xx extends A_{}class Qx extends A_{}class I_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class Zx extends I_{}class Jx extends I_{}class cn extends Q{}class eS extends cn{}class tS extends cn{async _call(e){return new Fn(await super._call(e))}}class rS extends cn{async _call(e){return new Ae(await super._call(e))}}class nS extends cn{async _call(e){return new vt(await super._call(e))}}class wl extends Q{}class aS extends wl{}class iS extends wl{async _call(e){return new Fn(await super._call(e))}}class sS extends wl{async _call(e){return new Ae(await super._call(e))}}class hs extends Q{}class oS extends hs{}class lS extends hs{async _call(e){return new Fn(await super._call(e))}}class uS extends hs{async _call(e){return new Ae(await super._call(e))}}class dS extends hs{async _call(e){return new vt(await super._call(e))}}class bl extends Q{}class cS extends bl{}class pS extends bl{async _call(e){return new Fn(await super._call(e))}}class hS extends bl{async _call(e){return new Ae(await super._call(e))}}class fS extends cn{}class mS extends cn{async _call(e){return new Fn(await super._call(e))}}class gS extends cn{async _call(e){return new Ae(await super._call(e))}}class ja extends Q{}class _S extends ja{}class yS extends ja{async _call(e){return new Fn(await super._call(e))}}class wS extends ja{async _call(e){return new Ae(await super._call(e))}}class bS extends ja{async _call(e){return new E3(await super._call(e))}}class vS extends ja{async _call(e){return new vt(await super._call(e))}}class M_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class $S extends M_{}class xS extends M_{async generate_speech(e,r,{threshold:n=.5,minlenratio:a=0,maxlenratio:i=20,vocoder:s=null}={}){const o={input_ids:e},{encoder_outputs:l,encoder_attention_mask:u}=await Ca(this,o),p=l.dims[1]/this.config.reduction_factor,h=Math.floor(p*i),m=Math.floor(p*a),d=this.config.num_mel_bins;let _=[],w=null,v=null,S=0;for(;;){++S;const T=Bg(!!v);let A;v?A=v.output_sequence_out:A=new ue("float32",new Float32Array(d),[1,1,d]);let P={use_cache_branch:T,output_sequence:A,encoder_attention_mask:u,speaker_embeddings:r,encoder_hidden_states:l};this.addPastKeyValues(P,w),v=await Fr(this.sessions.decoder_model_merged,P),w=this.getPastKeyValues(v,w);const{prob:B,spectrum:D}=v;if(_.push(D),S>=m&&(Array.from(B.data).filter(q=>q>=n).length>0||S>=h))break}const $=gr(_),{waveform:E}=await Fr(s.sessions.model,{spectrogram:$});return{spectrogram:$,waveform:E}}}class SS extends Q{main_input_name="spectrogram"}class kS extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class ES extends kS{}class O_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class CS extends O_{}class TS extends O_{}class z_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class AS extends z_{}class IS extends z_{}class P_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class MS extends P_{}class OS extends P_{}class vl extends Q{}class zS extends vl{}class PS extends vl{static async from_pretrained(e,r={}){return r.model_file_name??="text_model",super.from_pretrained(e,r)}}class RS extends vl{static async from_pretrained(e,r={}){return r.model_file_name??="audio_model",super.from_pretrained(e,r)}}class BS extends Q{}class R_ extends BS{async _call(e){return new T3(await super._call(e))}}class B_ extends Q{}class DS extends B_{}class NS extends B_{}class D_ extends Q{constructor(e,r,n){super(e,r),this.generation_config=n}}class FS extends D_{}class LS extends D_{}class N_ extends Q{}class US extends N_{}class WS extends N_{async _call(e){return new Ae(await super._call(e))}}class F_ extends Q{forward_params=["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"];constructor(e,r,n){super(e,r),this.generation_config=n}_apply_and_filter_by_delay_pattern_mask(e){const[r,n]=e.dims,a=this.config.decoder.num_codebooks,i=n-a;let s=0;for(let u=0;u0&&m<=i&&(e.data[s++]=e.data[u])}const o=Math.floor(r/a),l=s/(o*a);return new ue(e.type,e.data.slice(0,s),[o,a,l])}prepare_inputs_for_generation(e,r,n){let a=structuredClone(e);for(let s=0;s=o&&(a[s][o]=BigInt(this.config.decoder.pad_token_id));return n.guidance_scale!==null&&n.guidance_scale>1&&(a=a.concat(a)),super.prepare_inputs_for_generation(a,r,n)}async generate(e){const r=await super.generate(e),n=this._apply_and_filter_by_delay_pattern_mask(r).unsqueeze_(0),{audio_values:a}=await Fr(this.sessions.encodec_decode,{audio_codes:n});return a}}class L_ extends Q{}class VS extends L_{}class GS extends L_{async _call(e){return new Ae(await super._call(e))}}class U_ extends Q{}class HS extends U_{}class jS extends U_{async _call(e){return new Ae(await super._call(e))}}class W_ extends Q{}class qS extends W_{}class KS extends W_{async _call(e){return new Ae(await super._call(e))}}class V_ extends Q{}class YS extends V_{}class XS extends V_{async _call(e){return new Ae(await super._call(e))}}class et{static MODEL_CLASS_MAPPINGS=null;static BASE_IF_FAIL=!1;static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",model_file_name:o=null,subfolder:l="onnx",device:u=null,dtype:p=null,use_external_data_format:h=null,session_options:m={}}={}){let d={progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,model_file_name:o,subfolder:l,device:u,dtype:p,use_external_data_format:h,session_options:m};if(d.config=await Mg.from_pretrained(e,d),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);for(let _ of this.MODEL_CLASS_MAPPINGS){const w=_.get(d.config.model_type);if(w)return await w[1].from_pretrained(e,d)}if(this.BASE_IF_FAIL)return console.warn(`Unknown model class "${d.config.model_type}", attempting to construct from base class.`),await Q.from_pretrained(e,d);throw Error(`Unsupported model type: ${d.config.model_type}`)}}const QS=new Map([["bert",["BertModel",y2]],["nomic_bert",["NomicBertModel",S2]],["roformer",["RoFormerModel",k2]],["electra",["ElectraModel",R2]],["esm",["EsmModel",s1]],["convbert",["ConvBertModel",I2]],["camembert",["CamembertModel",L2]],["deberta",["DebertaModel",H2]],["deberta-v2",["DebertaV2Model",X2]],["mpnet",["MPNetModel",f1]],["albert",["AlbertModel",x1]],["distilbert",["DistilBertModel",t1]],["roberta",["RobertaModel",G1]],["xlm",["XLMModel",Y1]],["xlm-roberta",["XLMRobertaModel",e$]],["clap",["ClapModel",zS]],["clip",["CLIPModel",c$]],["clipseg",["CLIPSegModel",w$]],["chinese_clip",["ChineseCLIPModel",y$]],["siglip",["SiglipModel",f$]],["mobilebert",["MobileBertModel",d1]],["squeezebert",["SqueezeBertModel",w1]],["wav2vec2",["Wav2Vec2Model",eS]],["wav2vec2-bert",["Wav2Vec2BertModel",cS]],["unispeech",["UniSpeechModel",aS]],["unispeech-sat",["UniSpeechSatModel",oS]],["hubert",["HubertModel",fS]],["wavlm",["WavLMModel",_S]],["audio-spectrogram-transformer",["ASTModel",i$]],["vits",["VitsModel",R_]],["detr",["DetrModel",mx]],["table-transformer",["TableTransformerModel",wx]],["vit",["ViTModel",J$]],["fastvit",["FastViTModel",tx]],["mobilevit",["MobileViTModel",ix]],["mobilevitv2",["MobileViTV2Model",ox]],["owlvit",["OwlViTModel",ux]],["owlv2",["Owlv2Model",cx]],["beit",["BeitModel",hx]],["deit",["DeiTModel",$x]],["convnext",["ConvNextModel",Nx]],["convnextv2",["ConvNextV2Model",Lx]],["dinov2",["Dinov2Model",Wx]],["resnet",["ResNetModel",Sx]],["swin",["SwinModel",Ex]],["swin2sr",["Swin2SRModel",Tx]],["donut-swin",["DonutSwinModel",Dx]],["yolos",["YolosModel",Gx]],["dpt",["DPTModel",Ix]],["glpn",["GLPNModel",Px]],["hifigan",["SpeechT5HifiGan",SS]],["efficientnet",["EfficientNetModel",US]],["mobilenet_v1",["MobileNetV1Model",VS]],["mobilenet_v2",["MobileNetV2Model",HS]],["mobilenet_v3",["MobileNetV3Model",qS]],["mobilenet_v4",["MobileNetV4Model",YS]]]),ZS=new Map([["t5",["T5Model",C1]],["longt5",["LongT5Model",A1]],["mt5",["MT5Model",M1]],["bart",["BartModel",z1]],["mbart",["MBartModel",B1]],["marian",["MarianModel",Xx]],["whisper",["WhisperModel",o$]],["m2m_100",["M2M100Model",Zx]],["blenderbot",["BlenderbotModel",L1]],["blenderbot-small",["BlenderbotSmallModel",W1]]]),JS=new Map([["bloom",["BloomModel",q$]],["gpt2",["GPT2Model",v$]],["gptj",["GPTJModel",C$]],["gpt_bigcode",["GPTBigCodeModel",A$]],["gpt_neo",["GPTNeoModel",x$]],["gpt_neox",["GPTNeoXModel",k$]],["codegen",["CodeGenModel",M$]],["llama",["LlamaModel",z$]],["cohere",["CohereModel",R$]],["gemma",["GemmaModel",D$]],["openelm",["OpenELMModel",F$]],["qwen2",["Qwen2Model",U$]],["phi",["PhiModel",V$]],["phi3",["Phi3Model",H$]],["mpt",["MptModel",Y$]],["opt",["OPTModel",Q$]],["mistral",["MistralModel",CS]],["starcoder2",["Starcoder2Model",AS]],["falcon",["FalconModel",MS]],["stablelm",["StableLmModel",FS]]]),$l=new Map([["speecht5",["SpeechT5ForSpeechToText",$S]],["whisper",["WhisperForConditionalGeneration",l$]]]),G_=new Map([["speecht5",["SpeechT5ForTextToSpeech",xS]]]),H_=new Map([["vits",["VitsModel",R_]],["musicgen",["MusicgenForConditionalGeneration",F_]]]),j_=new Map([["bert",["BertForSequenceClassification",b2]],["roformer",["RoFormerForSequenceClassification",C2]],["electra",["ElectraForSequenceClassification",D2]],["esm",["EsmForSequenceClassification",l1]],["convbert",["ConvBertForSequenceClassification",O2]],["camembert",["CamembertForSequenceClassification",W2]],["deberta",["DebertaForSequenceClassification",q2]],["deberta-v2",["DebertaV2ForSequenceClassification",Z2]],["mpnet",["MPNetForSequenceClassification",g1]],["albert",["AlbertForSequenceClassification",S1]],["distilbert",["DistilBertForSequenceClassification",r1]],["roberta",["RobertaForSequenceClassification",j1]],["xlm",["XLMForSequenceClassification",Q1]],["xlm-roberta",["XLMRobertaForSequenceClassification",r$]],["bart",["BartForSequenceClassification",R1]],["mbart",["MBartForSequenceClassification",N1]],["mobilebert",["MobileBertForSequenceClassification",p1]],["squeezebert",["SqueezeBertForSequenceClassification",v1]]]),q_=new Map([["bert",["BertForTokenClassification",v2]],["roformer",["RoFormerForTokenClassification",T2]],["electra",["ElectraForTokenClassification",N2]],["esm",["EsmForTokenClassification",u1]],["convbert",["ConvBertForTokenClassification",z2]],["camembert",["CamembertForTokenClassification",V2]],["deberta",["DebertaForTokenClassification",K2]],["deberta-v2",["DebertaV2ForTokenClassification",J2]],["mpnet",["MPNetForTokenClassification",_1]],["distilbert",["DistilBertForTokenClassification",n1]],["roberta",["RobertaForTokenClassification",q1]],["xlm",["XLMForTokenClassification",Z1]],["xlm-roberta",["XLMRobertaForTokenClassification",n$]]]),xl=new Map([["t5",["T5ForConditionalGeneration",T1]],["longt5",["LongT5ForConditionalGeneration",I1]],["mt5",["MT5ForConditionalGeneration",O1]],["bart",["BartForConditionalGeneration",P1]],["mbart",["MBartForConditionalGeneration",D1]],["marian",["MarianMTModel",Qx]],["m2m_100",["M2M100ForConditionalGeneration",Jx]],["blenderbot",["BlenderbotForConditionalGeneration",U1]],["blenderbot-small",["BlenderbotSmallForConditionalGeneration",V1]]]),Sl=new Map([["bloom",["BloomForCausalLM",K$]],["gpt2",["GPT2LMHeadModel",$$]],["gptj",["GPTJForCausalLM",T$]],["gpt_bigcode",["GPTBigCodeForCausalLM",I$]],["gpt_neo",["GPTNeoForCausalLM",S$]],["gpt_neox",["GPTNeoXForCausalLM",E$]],["codegen",["CodeGenForCausalLM",O$]],["llama",["LlamaForCausalLM",P$]],["cohere",["CohereForCausalLM",B$]],["gemma",["GemmaForCausalLM",N$]],["openelm",["OpenELMForCausalLM",L$]],["qwen2",["Qwen2ForCausalLM",W$]],["phi",["PhiForCausalLM",G$]],["phi3",["Phi3ForCausalLM",j$]],["mpt",["MptForCausalLM",X$]],["opt",["OPTForCausalLM",Z$]],["mbart",["MBartForCausalLM",F1]],["mistral",["MistralForCausalLM",TS]],["starcoder2",["Starcoder2ForCausalLM",IS]],["falcon",["FalconForCausalLM",OS]],["trocr",["TrOCRForCausalLM",ES]],["stablelm",["StableLmForCausalLM",LS]]]),K_=new Map([["bert",["BertForMaskedLM",w2]],["roformer",["RoFormerForMaskedLM",E2]],["electra",["ElectraForMaskedLM",B2]],["esm",["EsmForMaskedLM",o1]],["convbert",["ConvBertForMaskedLM",M2]],["camembert",["CamembertForMaskedLM",U2]],["deberta",["DebertaForMaskedLM",j2]],["deberta-v2",["DebertaV2ForMaskedLM",Q2]],["mpnet",["MPNetForMaskedLM",m1]],["albert",["AlbertForMaskedLM",E1]],["distilbert",["DistilBertForMaskedLM",i1]],["roberta",["RobertaForMaskedLM",H1]],["xlm",["XLMWithLMHeadModel",X1]],["xlm-roberta",["XLMRobertaForMaskedLM",t$]],["mobilebert",["MobileBertForMaskedLM",c1]],["squeezebert",["SqueezeBertForMaskedLM",b1]]]),Y_=new Map([["bert",["BertForQuestionAnswering",$2]],["roformer",["RoFormerForQuestionAnswering",A2]],["electra",["ElectraForQuestionAnswering",F2]],["convbert",["ConvBertForQuestionAnswering",P2]],["camembert",["CamembertForQuestionAnswering",G2]],["deberta",["DebertaForQuestionAnswering",Y2]],["deberta-v2",["DebertaV2ForQuestionAnswering",e1]],["mpnet",["MPNetForQuestionAnswering",y1]],["albert",["AlbertForQuestionAnswering",k1]],["distilbert",["DistilBertForQuestionAnswering",a1]],["roberta",["RobertaForQuestionAnswering",K1]],["xlm",["XLMForQuestionAnswering",J1]],["xlm-roberta",["XLMRobertaForQuestionAnswering",a$]],["mobilebert",["MobileBertForQuestionAnswering",h1]],["squeezebert",["SqueezeBertForQuestionAnswering",$1]]]),kl=new Map([["vision-encoder-decoder",["VisionEncoderDecoderModel",Gg]]]),e3=new Map([["llava",["LlavaForConditionalGeneration",Hg]],["moondream1",["Moondream1ForConditionalGeneration",d$]]]),t3=new Map([["vision-encoder-decoder",["VisionEncoderDecoderModel",Gg]]]),X_=new Map([["vit",["ViTForImageClassification",ex]],["fastvit",["FastViTForImageClassification",rx]],["mobilevit",["MobileViTForImageClassification",sx]],["mobilevitv2",["MobileViTV2ForImageClassification",lx]],["beit",["BeitForImageClassification",fx]],["deit",["DeiTForImageClassification",xx]],["convnext",["ConvNextForImageClassification",Fx]],["convnextv2",["ConvNextV2ForImageClassification",Ux]],["dinov2",["Dinov2ForImageClassification",Vx]],["resnet",["ResNetForImageClassification",kx]],["swin",["SwinForImageClassification",Cx]],["segformer",["SegformerForImageClassification",DS]],["efficientnet",["EfficientNetForImageClassification",WS]],["mobilenet_v1",["MobileNetV1ForImageClassification",GS]],["mobilenet_v2",["MobileNetV2ForImageClassification",jS]],["mobilenet_v3",["MobileNetV3ForImageClassification",KS]],["mobilenet_v4",["MobileNetV4ForImageClassification",XS]]]),Q_=new Map([["detr",["DetrForObjectDetection",gx]],["table-transformer",["TableTransformerForObjectDetection",bx]],["yolos",["YolosForObjectDetection",Hx]]]),Z_=new Map([["owlvit",["OwlViTForObjectDetection",dx]],["owlv2",["Owlv2ForObjectDetection",px]]]),J_=new Map([["detr",["DetrForSegmentation",_x]],["clipseg",["CLIPSegForImageSegmentation",b$]]]),e0=new Map([["segformer",["SegformerForSemanticSegmentation",NS]]]),r3=new Map([["sam",["SamModel",Kx]]]),t0=new Map([["wav2vec2",["Wav2Vec2ForCTC",tS]],["wav2vec2-bert",["Wav2Vec2BertForCTC",pS]],["unispeech",["UniSpeechForCTC",iS]],["unispeech-sat",["UniSpeechSatForCTC",lS]],["wavlm",["WavLMForCTC",yS]],["hubert",["HubertForCTC",mS]]]),r0=new Map([["wav2vec2",["Wav2Vec2ForSequenceClassification",rS]],["wav2vec2-bert",["Wav2Vec2BertForSequenceClassification",hS]],["unispeech",["UniSpeechForSequenceClassification",sS]],["unispeech-sat",["UniSpeechSatForSequenceClassification",uS]],["wavlm",["WavLMForSequenceClassification",wS]],["hubert",["HubertForSequenceClassification",gS]],["audio-spectrogram-transformer",["ASTForAudioClassification",s$]]]),n3=new Map([["wavlm",["WavLMForXVector",bS]]]),a3=new Map([["unispeech-sat",["UniSpeechSatForAudioFrameClassification",dS]],["wavlm",["WavLMForAudioFrameClassification",vS]],["wav2vec2",["Wav2Vec2ForAudioFrameClassification",nS]]]),i3=new Map([["vitmatte",["VitMatteForImageMatting",ax]]]),n0=new Map([["swin2sr",["Swin2SRForImageSuperResolution",Ax]]]),a0=new Map([["dpt",["DPTForDepthEstimation",Mx]],["depth_anything",["DepthAnythingForDepthEstimation",zx]],["glpn",["GLPNForDepthEstimation",Rx]]]),i0=new Map([["clip",["CLIPVisionModelWithProjection",h$]],["siglip",["SiglipVisionModel",g$]]]),s0=[[QS,ve.EncoderOnly],[ZS,ve.EncoderDecoder],[JS,ve.DecoderOnly],[j_,ve.EncoderOnly],[q_,ve.EncoderOnly],[xl,ve.Seq2Seq],[$l,ve.Seq2Seq],[Sl,ve.DecoderOnly],[K_,ve.EncoderOnly],[Y_,ve.EncoderOnly],[kl,ve.Vision2Seq],[e3,ve.ImageTextToText],[X_,ve.EncoderOnly],[J_,ve.EncoderOnly],[e0,ve.EncoderOnly],[i3,ve.EncoderOnly],[n0,ve.EncoderOnly],[a0,ve.EncoderOnly],[Q_,ve.EncoderOnly],[Z_,ve.EncoderOnly],[r3,ve.MaskGeneration],[t0,ve.EncoderOnly],[r0,ve.EncoderOnly],[G_,ve.Seq2Seq],[H_,ve.EncoderOnly],[n3,ve.EncoderOnly],[a3,ve.EncoderOnly],[i0,ve.EncoderOnly]];for(const[t,e]of s0)for(const[r,n]of t.values())es.set(r,e),ba.set(n,r),zg.set(r,n);const s3=[["MusicgenForConditionalGeneration",F_,ve.Musicgen],["CLIPTextModelWithProjection",p$,ve.EncoderOnly],["SiglipTextModel",m$,ve.EncoderOnly],["ClapTextModelWithProjection",PS,ve.EncoderOnly],["ClapAudioModelWithProjection",RS,ve.EncoderOnly]];for(const[t,e,r]of s3)es.set(t,r),ba.set(e,t),zg.set(t,e);class ma extends et{static MODEL_CLASS_MAPPINGS=s0.map(e=>e[0]);static BASE_IF_FAIL=!0}class Ep extends et{static MODEL_CLASS_MAPPINGS=[j_]}class o3 extends et{static MODEL_CLASS_MAPPINGS=[q_]}class ho extends et{static MODEL_CLASS_MAPPINGS=[xl]}class l3 extends et{static MODEL_CLASS_MAPPINGS=[$l]}class u3 extends et{static MODEL_CLASS_MAPPINGS=[G_]}class d3 extends et{static MODEL_CLASS_MAPPINGS=[H_]}class c3 extends et{static MODEL_CLASS_MAPPINGS=[Sl]}class p3 extends et{static MODEL_CLASS_MAPPINGS=[K_]}class h3 extends et{static MODEL_CLASS_MAPPINGS=[Y_]}class f3 extends et{static MODEL_CLASS_MAPPINGS=[kl]}class m3 extends et{static MODEL_CLASS_MAPPINGS=[X_]}class g3 extends et{static MODEL_CLASS_MAPPINGS=[J_]}class _3 extends et{static MODEL_CLASS_MAPPINGS=[e0]}class y3 extends et{static MODEL_CLASS_MAPPINGS=[Q_]}class w3 extends et{static MODEL_CLASS_MAPPINGS=[Z_]}class b3 extends et{static MODEL_CLASS_MAPPINGS=[t0]}class v3 extends et{static MODEL_CLASS_MAPPINGS=[r0]}class $3 extends et{static MODEL_CLASS_MAPPINGS=[t3]}class x3 extends et{static MODEL_CLASS_MAPPINGS=[n0]}class S3 extends et{static MODEL_CLASS_MAPPINGS=[a0]}class k3 extends et{static MODEL_CLASS_MAPPINGS=[i0]}class Ae extends Yt{constructor({logits:e}){super(),this.logits=e}}class E3 extends Yt{constructor({logits:e,embeddings:r}){super(),this.logits=e,this.embeddings=r}}class vt extends Yt{constructor({logits:e}){super(),this.logits=e}}class $t extends Yt{constructor({logits:e}){super(),this.logits=e}}class Ct extends Yt{constructor({start_logits:e,end_logits:r}){super(),this.start_logits=e,this.end_logits=r}}class Fn extends Yt{constructor({logits:e}){super(),this.logits=e}}class C3 extends Yt{constructor({alphas:e}){super(),this.alphas=e}}class T3 extends Yt{constructor({waveform:e,spectrogram:r}){super(),this.waveform=e,this.spectrogram=r}}const Gt=typeof self<"u",A3=Gt&&self.constructor.name==="DedicatedWorkerGlobalScope";let Qr,o0,Rr;if(Gt)Qr=(t,e)=>{if(!self.OffscreenCanvas)throw new Error("OffscreenCanvas not supported by this browser.");return new self.OffscreenCanvas(t,e)},Rr=self.createImageBitmap,o0=self.ImageData;else if(Je)Rr=async t=>{const r=(await t.metadata()).channels,{data:n,info:a}=await t.rotate().raw().toBuffer({resolveWithObject:!0}),i=new yt(new Uint8ClampedArray(n),a.width,a.height,a.channels);return r!==void 0&&r!==a.channels&&i.convert(r),i};else throw new Error("Unable to load image processing library.");const I3={0:"nearest",1:"lanczos",2:"bilinear",3:"bicubic",4:"box",5:"hamming"},M3=new Map([["png","image/png"],["jpg","image/jpeg"],["jpeg","image/jpeg"],["gif","image/gif"]]);class yt{constructor(e,r,n,a){this.data=e,this.width=r,this.height=n,this.channels=a}get size(){return[this.width,this.height]}static async read(e){if(e instanceof yt)return e;if(typeof e=="string"||e instanceof URL)return await this.fromURL(e);throw new Error(`Unsupported input type: ${typeof e}`)}static fromCanvas(e){if(!Gt)throw new Error("fromCanvas() is only supported in browser environments.");const n=e.getContext("2d").getImageData(0,0,e.width,e.height).data;return new yt(n,e.width,e.height,4)}static async fromURL(e){const r=await Ui(e);if(r.status!==200)throw new Error(`Unable to read image from "${e}" (${r.status} ${r.statusText})`);const n=await r.blob();return this.fromBlob(n)}static async fromBlob(e){if(Gt){const r=await Rr(e),n=Qr(r.width,r.height).getContext("2d");return n.drawImage(r,0,0),new this(n.getImageData(0,0,r.width,r.height).data,r.width,r.height,4)}else{const r=Je(await e.arrayBuffer());return await Rr(r)}}static fromTensor(e,r="CHW"){if(e.dims.length!==3)throw new Error(`Tensor should have 3 dimensions, but has ${e.dims.length} dimensions.`);if(r==="CHW")e=e.transpose(1,2,0);else if(r!=="HWC")throw new Error(`Unsupported channel format: ${r}`);if(!(e.data instanceof Uint8ClampedArray||e.data instanceof Uint8Array))throw new Error(`Unsupported tensor type: ${e.type}`);switch(e.dims[2]){case 1:case 2:case 3:case 4:return new yt(e.data,e.dims[1],e.dims[0],e.dims[2]);default:throw new Error(`Unsupported number of channels: ${e.dims[2]}`)}}grayscale(){if(this.channels===1)return this;const e=new Uint8ClampedArray(this.width*this.height*1);switch(this.channels){case 3:case 4:for(let r=0,n=0;r=0?l=n:p=-n,a>=0?u=a:h=-a,o.drawImage(s,l,u,e,r,p,h,e,r),new yt(o.getImageData(0,0,e,r).data,e,r,4).convert(i)}else{let i=this.toSharp();if(n>=0&&a>=0)i=i.extract({left:Math.floor(n),top:Math.floor(a),width:e,height:r});else if(n<=0&&a<=0){const s=Math.floor(-a),o=Math.floor(-n);i=i.extend({top:s,left:o,right:e-this.width-o,bottom:r-this.height-s})}else{let s=[0,0],o=0;a<0?(s[0]=Math.floor(-a),s[1]=r-this.height-s[0]):o=Math.floor(a);let l=[0,0],u=0;n<0?(l[0]=Math.floor(-n),l[1]=e-this.width-l[0]):u=Math.floor(n),i=i.extend({top:s[0],bottom:s[1],left:l[0],right:l[1]}).extract({left:u,top:o,width:e,height:r})}return await Rr(i)}}async toBlob(e="image/png",r=1){if(!Gt)throw new Error("toBlob() is only supported in browser environments.");return await this.toCanvas().convertToBlob({type:e,quality:r})}toTensor(e="CHW"){let r=new ue("uint8",new Uint8Array(this.data),[this.height,this.width,this.channels]);if(e!=="HWC")if(e==="CHW")r=r.permute(2,0,1);else throw new Error(`Unsupported channel format: ${e}`);return r}toCanvas(){if(!Gt)throw new Error("toCanvas() is only supported in browser environments.");const e=this.clone().rgba(),r=Qr(e.width,e.height),n=new o0(e.data,e.width,e.height);return r.getContext("2d").putImageData(n,0,0),r}_update(e,r,n,a=null){return this.data=e,this.width=r,this.height=n,a!==null&&(this.channels=a),this}clone(){return new yt(this.data.slice(),this.width,this.height,this.channels)}convert(e){if(this.channels===e)return this;switch(e){case 1:this.grayscale();break;case 3:this.rgb();break;case 4:this.rgba();break;default:throw new Error(`Conversion failed due to unsupported number of channels: ${this.channels}`)}return this}async save(e){if(Gt){if(A3)throw new Error("Unable to save an image from a Web Worker.");const r=e.split(".").pop().toLowerCase(),n=M3.get(r)??"image/png",a=await this.toBlob(n),i=URL.createObjectURL(a),s=document.createElement("a");s.href=i,s.download=e,s.click(),s.remove()}else{if(zt.useFS)return await this.toSharp().toFile(e);throw new Error("Unable to save the image because filesystem is disabled in this environment.")}}toSharp(){if(Gt)throw new Error("toSharp() is only supported in server-side environments.");return Je(this.data,{raw:{width:this.width,height:this.height,channels:this.channels}})}}async function O3(t,e){if(typeof AudioContext>"u")throw Error("Unable to load audio from path/URL since `AudioContext` is not available in your environment. Instead, audio data should be passed directly to the pipeline/processor. For more information and some example code, see https://huggingface.co/docs/transformers.js/guides/node-audio-processing.");const r=await(await Ui(t)).arrayBuffer(),n=new AudioContext({sampleRate:e});typeof e>"u"&&console.warn(`No sampling rate provided, using default of ${n.sampleRate}Hz.`);const a=await n.decodeAudioData(r);let i;if(a.numberOfChannels===2){const s=Math.sqrt(2),o=a.getChannelData(0),l=a.getChannelData(1);i=new Float32Array(o.length);for(let u=0;u2595*Math.log10(1+t/700),kaldi:t=>1127*Math.log(1+t/700),slaney:(t,e=1e3,r=15,n=27/Math.log(6.4))=>t>=e?r+Math.log(t/e)*n:3*t/200};function fo(t,e="htk"){const r=z3[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}const P3={htk:t=>700*(10**(t/2595)-1),kaldi:t=>700*(Math.exp(t/1127)-1),slaney:(t,e=1e3,r=15,n=Math.log(6.4)/27)=>t>=r?e*Math.exp(n*(t-r)):200*t/3};function R3(t,e="htk"){const r=P3[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}function B3(t,e){const r=Float64Array.from({length:e.length-1},(s,o)=>e[o+1]-e[o]),n=Array.from({length:t.length},()=>new Array(e.length));for(let s=0;snew Array(t.length));for(let s=0;st+n*i)}function Ta(t,e,r,n,a,i=null,s="htk",o=!1){if(i!==null&&i!=="slaney")throw new Error('norm must be one of null or "slaney"');const l=fo(r,s),u=fo(n,s),p=Tp(l,u,e+2);let h=R3(p,s),m;if(o){const _=a/(t*2);m=fo(Float64Array.from({length:t},(w,v)=>v*_),s),h=p}else m=Tp(0,Math.floor(a/2),t);const d=B3(m,h);if(i!==null&&i==="slaney")for(let _=0;_a)throw Error(`frame_length (${r}) may not be larger than fft_length (${a})`);if(T!==r)throw new Error(`Length of the window (${T}) must equal frame_length (${r})`);if(n<=0)throw new Error("hop_length must be greater than zero");if(i===null&&p!==null)throw new Error("You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram. Specify `power` to fix this issue.");if(s){if(o!=="reflect")throw new Error(`pad_mode="${o}" not implemented yet.`);const R=Math.floor((a-1)/2)+1;t=D3(t,R,R)}const A=Math.floor(1+Math.floor((t.length-r)/n)),P=l?Math.floor(a/2)+1:a;let B=A,D=A;S!==null&&(S>A?$&&(D=S):D=B=S);const q=new V0(a),H=new Float64Array(a),ie=new Float64Array(q.outputBufferSize),te=new Float32Array(P*D);for(let R=0;R=1;--re)H[re]-=u*H[re-1];H[0]*=1-u}for(let re=0;reMath.pow(o,.85));break;default:throw new Error(`Unknown window type ${e}.`)}if(r&&(s=s.subarray(0,t)),n===null)return s;if(t>n)throw new Error(`Length of the window (${t}) may not be larger than frame_length (${n})`);return s}function L3([t,e,r,n]){return[t-r/2,e-n/2,t+r/2,e+n/2]}function El(t,e=.5,r=null,n=!1){const a=t.logits,i=t.pred_boxes,[s,o,l]=a.dims;if(r!==null&&r.length!==s)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");let u=[];for(let p=0;pe&&S.push(E)}else{let E=Kt(v.data)[1];if(E===l-1||($=wt(v.data),$[E]A*h[(P+1)%2])),m.boxes.push(T),m.classes.push(E),m.scores.push($[E])}}u.push(m)}return u}function qa(t,e){if(!(t instanceof Float32Array||t instanceof Float64Array))throw new Error(`${e} expects input to be a Float32Array or a Float64Array, but got ${t?.constructor?.name??typeof t} instead. If using the feature extractor directly, remember to use \`read_audio(url, sampling_rate)\` to obtain the raw audio data of the file/url.`)}function Ap(t,e,r=0,n=null){const a=t/e;let i=H0(a)*e;return n!==null&&i>n&&(i=Math.floor(a)*e),ii?u=Math.floor(i*l/a):i>a&&(l=Math.floor(a*u/i)),await e.resize(u,l,{resample:n}))}async crop_margin(e,r=200){const n=e.clone().grayscale(),a=Fp(n.data)[0],s=Kt(n.data)[0]-a;if(s===0)return e;const o=r/255;let l=n.width,u=n.height,p=0,h=0;const m=n.data;for(let d=0;dthis.preprocess(i)));return{pixel_values:ka(n.map(i=>i.pixel_values),0),original_sizes:n.map(i=>i.original_size),reshaped_input_sizes:n.map(i=>i.reshaped_input_size)}}}class U3 extends He{post_process_semantic_segmentation(e,r=null){const n=e.logits,a=n.dims[0];if(r!==null&&r.length!==a)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");const i=[];for(let s=0;sm[E]&&(m[E]=$[E],d[E]=S)}const _=new Array(l.dims[0]),w=h.data;for(let S=0;SS!==void 0);i.push({segmentation:h,labels:v})}return i}}class u0 extends He{}class W3 extends u0{}class V3 extends He{}class G3 extends He{}class d0 extends He{}class H3 extends d0{}class j3 extends He{}class q3 extends He{}class c0 extends He{constructor(e){super(e),this.crop_pct=this.config.crop_pct??224/256}async resize(e){const r=this.size?.shortest_edge;if(r===void 0)throw new Error("Size dictionary must contain 'shortest_edge' key.");if(r<384){const n=Math.floor(r/this.crop_pct),[a,i]=this.get_resize_output_image_size(e,{shortest_edge:n});e=await e.resize(a,i,{resample:this.resample}),e=await e.center_crop(r,r)}else e=await e.resize(r,r,{resample:this.resample});return e}}class K3 extends c0{}class Y3 extends He{}class X3 extends He{}class Q3 extends He{constructor(e){super(e),this.include_top=this.config.include_top??!0,this.include_top&&(this.image_std=this.image_std.map(r=>r*r))}}class Z3 extends He{}class J3 extends He{}class ek extends He{}class tk extends He{}class p0 extends He{}class rk extends p0{}class h0 extends He{post_process_object_detection(...e){return El(...e)}}class nk extends h0{}class ak extends He{}class ik extends He{}class f0 extends He{pad_image(e,r,n,a={}){const[i,s,o]=r;let l=this.image_mean;Array.isArray(this.image_mean)||(l=new Array(o).fill(l));let u=this.image_std;Array.isArray(u)||(u=new Array(o).fill(l));const p=l.map((h,m)=>-h/u[m]);return super.pad_image(e,r,n,{center:!0,constant_values:p,...a})}}class sk extends f0{}class ok extends He{async _call(e){const r=await super._call(e),n=[r.pixel_values.dims[0],64,64],a=new ue("int64",new BigInt64Array(n.reduce((i,s)=>i*s)).fill(1n),n);return{...r,pixel_mask:a}}post_process_object_detection(...e){return El(...e)}remove_low_and_no_objects(e,r,n,a){let i=[],s=[],o=[];for(let l=0;ln&&(i.push(p),s.push(d),o.push(h))}return[i,s,o]}check_segment_validity(e,r,n,a=.5,i=.8){let s=[],o=0,l=0;const u=r[n].data;for(let h=0;h=a&&++l;let p=o>0&&l>0;return p&&(p=o/l>i),[p,s]}compute_segments(e,r,n,a,i,s=null,o=null){let[l,u]=o??e[0].dims,p=new ue("int32",new Int32Array(l*u),[l,u]),h=[];if(o!==null)for(let v=0;vd[E]&&(m[E]=v,d[E]=$[E])}let _=0;const w=p.data;for(let v=0;va!==r.dims[i]))throw Error(`The first ${n.length} dimensions of 'input_points' and 'input_labels' must be the same.`);return new ue("int64",e.flat(1/0).map(BigInt),n)}async _call(e,{input_points:r=null,input_labels:n=null,input_boxes:a=null}={}){const i=await super._call(e);if(r&&(i.input_points=this.reshape_input_points(r,i.original_sizes,i.reshaped_input_sizes)),n){if(!i.input_points)throw Error("`input_points` must be provided if `input_labels` are provided.");i.input_labels=this.add_input_labels(n,i.input_points)}return a&&(i.input_boxes=this.reshape_input_points(a,i.original_sizes,i.reshaped_input_sizes,!0)),i}async post_process_masks(e,r,n,{mask_threshold:a=0,binarize:i=!0,pad_size:s=null}={}){const o=[];s=s??this.pad_size;const l=[s.height,s.width];for(let u=0;ua&&(_[w]=1);m=new ue("bool",_,m.dims)}o.push(m)}return o}generate_crop_boxes(e,r,{crop_n_layers:n=0,overlap_ratio:a=512/1500,points_per_crop:i=32,crop_n_points_downscale_factor:s=1}={}){}}class dk extends He{pad_image(e,r,n,a={}){const[i,s,o]=r;return super.pad_image(e,r,{width:s+(n-s%n)%n,height:i+(n-i%n)%n},{mode:"symmetric",center:!1,constant_values:-1,...a})}}class ck extends He{async _call(e,r){Array.isArray(e)||(e=[e]),Array.isArray(r)||(r=[r]);const n=await Promise.all(e.map(s=>this.preprocess(s))),a=await Promise.all(r.map(s=>this.preprocess(s,{do_normalize:!1,do_convert_rgb:!1,do_convert_grayscale:!0})));return{pixel_values:ka(n.map((s,o)=>gr([s.pixel_values,a[o].pixel_values],0)),0),original_sizes:n.map(s=>s.original_size),reshaped_input_sizes:n.map(s=>s.reshaped_input_size)}}}class pk extends pn{constructor(e){super(e),this.config.mel_filters??=Ta(Math.floor(1+this.config.n_fft/2),this.config.feature_size,0,8e3,this.config.sampling_rate,"slaney","slaney"),this.window=ms(this.config.n_fft,"hann")}async _extract_fbank_features(e){const r=await fs(e,this.window,this.config.n_fft,this.config.hop_length,{power:2,mel_filters:this.config.mel_filters,log_mel:"log10",max_num_frames:this.config.nb_max_frames}),n=r.data,a=Kt(n)[0];for(let i=0;ithis.config.n_samples?(console.warn("Attempting to extract features for audio longer than 30 seconds. If using a pipeline to extract transcript from a long audio clip, remember to specify `chunk_length_s` and/or `stride_length_s`."),r=e.slice(0,this.config.n_samples)):(r=new Float32Array(this.config.n_samples),r.set(e)),{input_features:(await this._extract_fbank_features(r)).unsqueeze_(0)}}}class hk extends pn{_zero_mean_unit_var_norm(e){const n=e.reduce((i,s)=>i+s,0)/e.length,a=e.reduce((i,s)=>i+(s-n)**2,0)/e.length;return e.map(i=>(i-n)/Math.sqrt(a+1e-7))}async _call(e){qa(e,"Wav2Vec2FeatureExtractor"),e instanceof Float64Array&&(e=new Float32Array(e));let r=e;this.config.do_normalize&&(r=this._zero_mean_unit_var_norm(r));const n=[1,r.length];return{input_values:new ue("float32",r,n),attention_mask:new ue("int64",new BigInt64Array(r.length).fill(1n),n)}}}class fk extends pn{constructor(e){super(e);const r=this.config.sampling_rate,n=Ta(256,this.config.num_mel_bins,20,Math.floor(r/2),r,null,"kaldi",!0);for(let a=0;an*32768),fs(e,this.window,400,160,{fft_length:512,power:2,center:!1,preemphasis:.97,mel_filters:this.mel_filters,log_mel:"log",mel_floor:1192092955078125e-22,remove_dc_offset:!0,max_num_frames:r,transpose:!0})}async _call(e,{padding:r=!0,pad_to_multiple_of:n=2,do_normalize_per_mel_bins:a=!0,return_attention_mask:i=!0}={}){qa(e,"SeamlessM4TFeatureExtractor");let s=await this._extract_fbank_features(e,this.config.max_length);if(a){const[_,w]=s.dims,v=s.data;for(let S=0;S0){const $=new Float32Array(w*(_+S));$.set(v),$.fill(this.config.padding_value,v.length);const E=_+S;s=new ue(s.type,$,[E,w]),i&&(o=new ue("int64",new BigInt64Array(E),[1,E]),o.data.fill(1n,0,_))}}const[l,u]=s.dims,p=this.config.stride;if(l%p!==0)throw new Error(`The number of frames (${l}) must be a multiple of the stride (${p}).`);const m=s.view(1,Math.floor(l/p),u*p),d={input_features:m};if(i){const _=m.dims[1],w=new BigInt64Array(_);if(o){const v=o.data;for(let S=1,$=0;S0)if(n==="rand_trunc"){const o=Math.floor(Math.random()*(s+1));e=e.subarray(o,o+r),i=await this._extract_fbank_features(e,this.mel_filters_slaney,this.config.nb_max_samples)}else throw new Error(`Truncation strategy "${n}" not implemented`);else{if(s<0){let o=new Float64Array(r);if(o.set(e),a==="repeat")for(let l=e.length;lyt.read(e)))}async function ts(t,e){return Array.isArray(t)||(t=[t]),await Promise.all(t.map(r=>typeof r=="string"||r instanceof URL?O3(r,e):r instanceof Float64Array?new Float32Array(r):r))}function m0(t,e){e&&(t=t.map(s=>s|0));const[r,n,a,i]=t;return{xmin:r,ymin:n,xmax:a,ymax:i}}class tt extends bt{constructor({task:e,model:r,tokenizer:n=null,processor:a=null}){super(),this.task=e,this.model=r,this.tokenizer=n,this.processor=a}async dispose(){await this.model.dispose()}}class xk extends tt{constructor(e){super(e)}async _call(e,{top_k:r=1}={}){const n=this.tokenizer(e,{padding:!0,truncation:!0}),a=await this.model(n),i=this.model.config.problem_type==="multi_label_classification"?l=>l.sigmoid():l=>new ue("float32",wt(l.data),l.dims),s=this.model.config.id2label,o=[];for(const l of a.logits){const u=i(l),p=await Dn(u,r),h=p[0].tolist(),d=p[1].tolist().map((_,w)=>({label:s?s[_]:`LABEL_${_}`,score:h[w]}));r===1?o.push(...d):o.push(d)}return Array.isArray(e)||r===1?o:o[0]}}class Sk extends tt{constructor(e){super(e)}async _call(e,{ignore_labels:r=["O"]}={}){const n=Array.isArray(e),a=this.tokenizer(n?e:[e],{padding:!0,truncation:!0}),s=(await this.model(a)).logits,o=this.model.config.id2label,l=[];for(let u=0;uE==this.tokenizer.sep_token_id);l[h].map((E,T)=>E==1&&(T===0||T>d&&u.findIndex(A=>A==m[T])===-1));const _=i[h].tolist(),w=s[h].tolist();for(let E=1;E<_.length;++E)(l[h]==0||E<=d||u.findIndex(T=>T==m[E])!==-1)&&(_[E]=-1/0,w[E]=-1/0);const v=wt(_).map((E,T)=>[E,T]),S=wt(w).map((E,T)=>[E,T]);v[0][0]=0,S[0][0]=0;const $=P0(v,S).filter(E=>E[0][1]<=E[1][1]).map(E=>[E[0][1],E[1][1],E[0][0]*E[1][0]]).sort((E,T)=>T[2]-E[2]);for(let E=0;E_==this.tokenizer.mask_token_id);if(u===-1)throw Error(`Mask token (${this.tokenizer.mask_token}) not found in text.`);const p=a[o][u],h=await Dn(new ue("float32",wt(p.data),p.dims),r),m=h[0].tolist(),d=h[1].tolist();i.push(d.map((_,w)=>{const v=l.slice();return v[u]=_,{score:m[w],token:Number(_),token_str:this.tokenizer.model.vocab[_],sequence:this.tokenizer.decode(v,{skip_special_tokens:!0})}}))}return Array.isArray(e)?i:i[0]}}class Cl extends tt{_key="generated_text";constructor(e){super(e)}async _call(e,r={}){Array.isArray(e)||(e=[e]),this.model.config.prefix&&(e=e.map(l=>this.model.config.prefix+l));const n=this.model.config.task_specific_params;n&&n[this.task]&&n[this.task].prefix&&(e=e.map(l=>n[this.task].prefix+l));const a=this.tokenizer,i={padding:!0,truncation:!0};let s;this instanceof g0&&"_build_translation_inputs"in a?s=a._build_translation_inputs(e,i,r):s=a(e,i);const o=await this.model.generate({...s,...r});return a.batch_decode(o,{skip_special_tokens:!0}).map(l=>({[this._key]:l}))}}class Ck extends Cl{_key="summary_text";constructor(e){super(e)}}class g0 extends Cl{_key="translation_text";constructor(e){super(e)}}class Tk extends tt{constructor(e){super(e)}async _call(e,r={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}}class Ak extends tt{constructor(e){super(e),this.label2id=Object.fromEntries(Object.entries(this.model.config.label2id).map(([r,n])=>[r.toLowerCase(),n])),this.entailment_id=this.label2id.entailment,this.entailment_id===void 0&&(console.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."),this.entailment_id=2),this.contradiction_id=this.label2id.contradiction??this.label2id.not_entailment,this.contradiction_id===void 0&&(console.warn("Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."),this.contradiction_id=0)}async _call(e,r,{hypothesis_template:n="This example is {}.",multi_label:a=!1}={}){const i=Array.isArray(e);i||(e=[e]),Array.isArray(r)||(r=[r]);const s=r.map(u=>n.replace("{}",u)),o=a||r.length===1,l=[];for(const u of e){const p=[];for(const d of s){const _=this.tokenizer(u,{text_pair:d,padding:!0,truncation:!0}),w=await this.model(_);o?p.push([w.logits.data[this.contradiction_id],w.logits.data[this.entailment_id]]):p.push(w.logits.data[this.entailment_id])}const m=(o?p.map(d=>wt(d)[1]):wt(p)).map((d,_)=>[d,_]).sort((d,_)=>_[0]-d[0]);l.push({sequence:u,labels:m.map(d=>r[d[1]]),scores:m.map(d=>d[0])})}return i?l:l[0]}}class Ik extends tt{constructor(e){super(e)}async _call(e,{pooling:r="none",normalize:n=!1,quantize:a=!1,precision:i="binary"}={}){const s=this.tokenizer(e,{padding:!0,truncation:!0}),o=await this.model(s);let l=o.last_hidden_state??o.logits??o.token_embeddings;if(r!=="none")if(r==="mean")l=Aw(l,s.attention_mask);else if(r==="cls")l=l.slice(null,0);else throw Error(`Pooling method '${r}' not supported.`);return n&&(l=l.normalize(2,-1)),a&&(l=Nw(l,i)),l}}class Mk extends tt{constructor(e){super(e)}async _call(e,{pool:r=null}={}){const n=await Tr(e),{pixel_values:a}=await this.processor(n),i=await this.model({pixel_values:a});let s;if(r){if(!("pooler_output"in i))throw Error("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");s=i.pooler_output}else s=i.last_hidden_state??i.logits??i.image_embeds;return s}}class Ok extends tt{constructor(e){super(e)}async _call(e,{top_k:r=5}={}){const n=this.processor.feature_extractor.config.sampling_rate,a=await ts(e,n),i=this.model.config.id2label,s=[];for(const o of a){const l=await this.processor(o),p=(await this.model(l)).logits[0],h=await Dn(new ue("float32",wt(p.data),p.dims),r),m=h[0].tolist(),_=h[1].tolist().map((w,v)=>({label:i?i[w]:`LABEL_${w}`,score:m[v]}));s.push(_)}return Array.isArray(e)?s:s[0]}}class zk extends tt{constructor(e){super(e)}async _call(e,r,{hypothesis_template:n="This is a sound of {}."}={}){const a=!Array.isArray(e);a&&(e=[e]);const i=r.map(p=>n.replace("{}",p)),s=this.tokenizer(i,{padding:!0,truncation:!0}),o=this.processor.feature_extractor.config.sampling_rate,l=await ts(e,o),u=[];for(const p of l){const h=await this.processor(p),m=await this.model({...s,...h}),d=wt(m.logits_per_audio.data);u.push([...d].map((_,w)=>({score:_,label:r[w]})))}return a?u[0]:u}}class Pk extends tt{constructor(e){super(e)}async _call(e,r={}){switch(this.model.config.model_type){case"whisper":return this._call_whisper(e,r);case"wav2vec2":case"wav2vec2-bert":case"unispeech":case"unispeech-sat":case"hubert":return this._call_wav2vec2(e,r);default:throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)}}async _call_wav2vec2(e,r){r.language&&console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'),r.task&&console.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');const n=!Array.isArray(e);n&&(e=[e]);const a=this.processor.feature_extractor.config.sampling_rate,i=await ts(e,a),s=[];for(const o of i){const l=await this.processor(o),p=(await this.model(l)).logits[0],h=[];for(const d of p)h.push(Kt(d.data)[1]);const m=this.tokenizer.decode(h);s.push({text:m})}return n?s[0]:s}async _call_whisper(e,r){const n=r.return_timestamps??!1,a=r.chunk_length_s??0,i=r.force_full_sequences??!1;let s=r.stride_length_s??null;n==="word"&&(r.return_token_timestamps=!0);const o=!Array.isArray(e);o&&(e=[e]);const l=this.processor.feature_extractor.config.chunk_length/this.model.config.max_source_positions,u=this.processor.feature_extractor.config.hop_length,p=this.processor.feature_extractor.config.sampling_rate,h=await ts(e,p),m=[];for(const d of h){let _=[];if(a>0){if(s===null)s=a/6;else if(a<=s)throw Error("`chunk_length_s` must be larger than `stride_length_s`.");const S=p*a,$=p*s,E=S-2*$;let T=0;for(;T=d.length;_.push({stride:[A.length,B?0:$,D?0:$],input_features:P.input_features,is_last:D}),T+=E}}else _=[{stride:[d.length,0,0],input_features:(await this.processor(d)).input_features,is_last:!0}];for(const S of _){r.num_frames=Math.floor(S.stride[0]/u);const $=await this.model.generate({inputs:S.input_features,...r});n==="word"?(S.tokens=$.sequences[0].tolist(),S.token_timestamps=$.token_timestamps.tolist()[0].map(E=>Mi(E,2))):S.tokens=$[0].tolist(),S.stride=S.stride.map(E=>E/p)}const[w,v]=this.tokenizer._decode_asr(_,{time_precision:l,return_timestamps:n,force_full_sequences:i});m.push({text:w,...v})}return o?m[0]:m}}class Rk extends tt{constructor(e){super(e)}async _call(e,r={}){const n=Array.isArray(e),a=await Tr(e),{pixel_values:i}=await this.processor(a),s=[];for(const o of i){o.dims=[1,...o.dims];const l=await this.model.generate({inputs:o,...r}),u=this.tokenizer.batch_decode(l,{skip_special_tokens:!0}).map(p=>({generated_text:p.trim()}));s.push(u)}return n?s:s[0]}}class Bk extends tt{constructor(e){super(e)}async _call(e,{top_k:r=5}={}){const n=await Tr(e),{pixel_values:a}=await this.processor(n),i=await this.model({pixel_values:a}),s=this.model.config.id2label,o=[];for(const l of i.logits){const u=await Dn(new ue("float32",wt(l.data),l.dims),r),p=u[0].tolist(),m=u[1].tolist().map((d,_)=>({label:s?s[d]:`LABEL_${d}`,score:p[_]}));o.push(m)}return Array.isArray(e)?o:o[0]}}class Dk extends tt{constructor(e){super(e),this.subtasks_mapping={panoptic:"post_process_panoptic_segmentation",instance:"post_process_instance_segmentation",semantic:"post_process_semantic_segmentation"}}async _call(e,{threshold:r=.5,mask_threshold:n=.5,overlap_mask_area_threshold:a=.8,label_ids_to_fuse:i=null,target_sizes:s=null,subtask:o=null}={}){if(Array.isArray(e)&&e.length!==1)throw Error("Image segmentation pipeline currently only supports a batch size of 1.");const u=await Tr(e),p=u.map(S=>[S.height,S.width]),{pixel_values:h,pixel_mask:m}=await this.processor(u),d=await this.model({pixel_values:h,pixel_mask:m});let _=null;if(o!==null)_=this.subtasks_mapping[o];else for(let[S,$]of Object.entries(this.subtasks_mapping))if($ in this.processor.feature_extractor){_=this.processor.feature_extractor[$].bind(this.processor.feature_extractor),o=S;break}const w=this.model.config.id2label,v=[];if(o==="panoptic"||o==="instance"){const S=_(d,r,n,a,i,s??p)[0],$=S.segmentation;for(const E of S.segments_info){const T=new Uint8ClampedArray($.data.length);for(let P=0;P<$.data.length;++P)$.data[P]===E.id&&(T[P]=255);const A=new yt(T,$.dims[1],$.dims[0],1);v.push({score:E.score,label:w[E.label_id],mask:A})}}else if(o==="semantic"){const{segmentation:S,labels:$}=_(d,s??p)[0];for(const E of $){const T=new Uint8ClampedArray(S.data.length);for(let P=0;Pn.replace("{}",m)),o=this.tokenizer(s,{padding:this.model.config.model_type==="siglip"?"max_length":!0,truncation:!0}),{pixel_values:l}=await this.processor(i),u=await this.model({...o,pixel_values:l}),p=this.model.config.model_type==="siglip"?m=>m.sigmoid().data:m=>wt(m.data),h=[];for(const m of u.logits_per_image){const _=[...p(m)].map((w,v)=>({score:w,label:r[v]}));_.sort((w,v)=>v.score-w.score),h.push(_)}return a?h:h[0]}}class Fk extends tt{constructor(e){super(e)}async _call(e,{threshold:r=.9,percentage:n=!1}={}){const a=Array.isArray(e);if(a&&e.length!==1)throw Error("Object detection pipeline currently only supports a batch size of 1.");const i=await Tr(e),s=n?null:i.map(d=>[d.height,d.width]),{pixel_values:o,pixel_mask:l}=await this.processor(i),u=await this.model({pixel_values:o,pixel_mask:l}),p=this.processor.feature_extractor.post_process_object_detection(u,r,s),h=this.model.config.id2label,m=p.map(d=>d.boxes.map((_,w)=>({score:d.scores[w],label:h[d.classes[w]],box:m0(_,!n)})));return a?m:m[0]}}class Lk extends tt{constructor(e){super(e)}async _call(e,r,{threshold:n=.1,top_k:a=null,percentage:i=!1}={}){const s=Array.isArray(e),o=await Tr(e),l=this.tokenizer(r,{padding:!0,truncation:!0}),u=await this.processor(o),p=[];for(let h=0;h({score:v.scores[E],label:r[v.classes[E]],box:m0($,!i)})).sort(($,E)=>E.score-$.score);a!==null&&(S=S.slice(0,a)),p.push(S)}return s?p:p[0]}}class Uk extends tt{constructor(e){super(e)}async _call(e,r,n={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}}class Wk extends tt{DEFAULT_VOCODER_ID="Xenova/speecht5_hifigan";constructor(e){super(e),this.vocoder=e.vocoder??null}async _call(e,{speaker_embeddings:r=null}={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}async _call_text_to_waveform(e){const r=this.tokenizer(e,{padding:!0,truncation:!0}),{waveform:n}=await this.model(r),a=this.model.config.sampling_rate;return{audio:n.data,sampling_rate:a}}async _call_text_to_spectrogram(e,{speaker_embeddings:r}){if(this.vocoder||(console.log("No vocoder specified, using default HifiGan vocoder."),this.vocoder=await ma.from_pretrained(this.DEFAULT_VOCODER_ID,{dtype:"fp32"})),(typeof r=="string"||r instanceof URL)&&(r=new Float32Array(await(await fetch(r)).arrayBuffer())),r instanceof Float32Array)r=new ue("float32",r,[1,r.length]);else if(!(r instanceof ue))throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.");const{input_ids:n}=this.tokenizer(e,{padding:!0,truncation:!0}),{waveform:a}=await this.model.generate_speech(n,r,{vocoder:this.vocoder}),i=this.processor.feature_extractor.config.sampling_rate;return{audio:a.data,sampling_rate:i}}}class Vk extends tt{constructor(e){super(e)}async _call(e){const r=await Tr(e),n=await this.processor(r),a=await this.model(n),i=[];for(const s of a.reconstruction){const o=s.squeeze().clamp_(0,1).mul_(255).round_().to("uint8");i.push(yt.fromTensor(o))}return i.length>1?i:i[0]}}class Gk extends tt{constructor(e){super(e)}async _call(e){const r=await Tr(e),n=await this.processor(r),{predicted_depth:a}=await this.model(n),i=[];for(let s=0;s1?i:i[0]}}const Ip=Object.freeze({"text-classification":{tokenizer:ht,pipeline:xk,model:Ep,default:{model:"Xenova/distilbert-base-uncased-finetuned-sst-2-english"},type:"text"},"token-classification":{tokenizer:ht,pipeline:Sk,model:o3,default:{model:"Xenova/bert-base-multilingual-cased-ner-hrl"},type:"text"},"question-answering":{tokenizer:ht,pipeline:kk,model:h3,default:{model:"Xenova/distilbert-base-cased-distilled-squad"},type:"text"},"fill-mask":{tokenizer:ht,pipeline:Ek,model:p3,default:{model:"Xenova/bert-base-uncased"},type:"text"},summarization:{tokenizer:ht,pipeline:Ck,model:ho,default:{model:"Xenova/distilbart-cnn-6-6"},type:"text"},translation:{tokenizer:ht,pipeline:g0,model:ho,default:{model:"Xenova/t5-small"},type:"text"},"text2text-generation":{tokenizer:ht,pipeline:Cl,model:ho,default:{model:"Xenova/flan-t5-small"},type:"text"},"text-generation":{tokenizer:ht,pipeline:Tk,model:c3,default:{model:"Xenova/gpt2"},type:"text"},"zero-shot-classification":{tokenizer:ht,pipeline:Ak,model:Ep,default:{model:"Xenova/distilbert-base-uncased-mnli"},type:"text"},"audio-classification":{pipeline:Ok,model:v3,processor:Ot,default:{model:"Xenova/wav2vec2-base-superb-ks"},type:"audio"},"zero-shot-audio-classification":{tokenizer:ht,pipeline:zk,model:ma,processor:Ot,default:{model:"Xenova/clap-htsat-unfused"},type:"multimodal"},"automatic-speech-recognition":{tokenizer:ht,pipeline:Pk,model:[l3,b3],processor:Ot,default:{model:"Xenova/whisper-tiny.en"},type:"multimodal"},"text-to-audio":{tokenizer:ht,pipeline:Wk,model:[d3,u3],processor:[Ot,null],default:{model:"Xenova/speecht5_tts"},type:"text"},"image-to-text":{tokenizer:ht,pipeline:Rk,model:f3,processor:Ot,default:{model:"Xenova/vit-gpt2-image-captioning"},type:"multimodal"},"image-classification":{pipeline:Bk,model:m3,processor:Ot,default:{model:"Xenova/vit-base-patch16-224"},type:"multimodal"},"image-segmentation":{pipeline:Dk,model:[g3,_3],processor:Ot,default:{model:"Xenova/detr-resnet-50-panoptic"},type:"multimodal"},"zero-shot-image-classification":{tokenizer:ht,pipeline:Nk,model:ma,processor:Ot,default:{model:"Xenova/clip-vit-base-patch32"},type:"multimodal"},"object-detection":{pipeline:Fk,model:y3,processor:Ot,default:{model:"Xenova/detr-resnet-50"},type:"multimodal"},"zero-shot-object-detection":{tokenizer:ht,pipeline:Lk,model:w3,processor:Ot,default:{model:"Xenova/owlvit-base-patch32"},type:"multimodal"},"document-question-answering":{tokenizer:ht,pipeline:Uk,model:$3,processor:Ot,default:{model:"Xenova/donut-base-finetuned-docvqa"},type:"multimodal"},"image-to-image":{pipeline:Vk,model:x3,processor:Ot,default:{model:"Xenova/swin2SR-classical-sr-x2-64"},type:"image"},"depth-estimation":{pipeline:Gk,model:S3,processor:Ot,default:{model:"Xenova/dpt-large"},type:"image"},"feature-extraction":{tokenizer:ht,pipeline:Ik,model:ma,default:{model:"Xenova/all-MiniLM-L6-v2"},type:"text"},"image-feature-extraction":{processor:Ot,pipeline:Mk,model:[k3,ma],default:{model:"Xenova/vit-base-patch16-224-in21k"},type:"image"}}),Hk=Object.freeze({"sentiment-analysis":"text-classification",ner:"token-classification",asr:"automatic-speech-recognition","text-to-speech":"text-to-audio",embeddings:"feature-extraction"});async function jk(t,e=null,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:i=!1,revision:s="main",device:o=null,dtype:l=null,model_file_name:u=null,session_options:p={}}={}){t=Hk[t]??t;const h=Ip[t.split("_",1)[0]];if(!h)throw Error(`Unsupported pipeline: ${t}. Must be one of [${Object.keys(Ip)}]`);e||(e=h.default.model,console.log(`No model specified. Using default model: "${e}".`));const m={progress_callback:r,config:n,cache_dir:a,local_files_only:i,revision:s,device:o,dtype:l,model_file_name:u,session_options:p},d=new Map([["tokenizer",h.tokenizer],["model",h.model],["processor",h.processor]]),_=await qk(d,e,m);_.task=t,An(r,{status:"ready",task:t,model:e});const w=h.pipeline;return new w(_)}async function qk(t,e,r){const n=Object.create(null),a=[];for(let[i,s]of t.entries()){if(!s)continue;let o;Array.isArray(s)?o=new Promise(async(l,u)=>{let p;for(let h of s){if(h===null){l(null);return}try{l(await h.from_pretrained(e,r));return}catch(m){if(m.message?.includes("Unsupported model type"))p=m;else{u(m);return}}}u(p)}):o=s.from_pretrained(e,r),n[i]=o,a.push(o)}await Promise.all(a);for(let[i,s]of Object.entries(n))n[i]=await s;return n}an.IS_PROCESS_AVAILABLE;const gs=document.getElementById("status"),Mp=document.getElementById("container"),ga=document.getElementById("video"),go=document.getElementById("overlay");gs.textContent="Loading model (16 MB)...";const Kk="onnx-community/mobilenetv4_conv_small.e2400_r224_in1k";let _0;try{_0=await jk("image-classification",Kk,{dtype:"fp32"})}catch(t){throw gs.textContent=t.message,alert(t.message),t}gs.textContent="Ready";const Yk=.1,Zr=256,Bo=document.createElement("canvas");Bo.width=Bo.height=Zr;const Op=Bo.getContext("2d",{willReadFrequently:!0});let _o=!1,yo;function y0(){_o||(_o=!0,async function(){Op.drawImage(ga,0,0,Zr,Zr);const t=Op.getImageData(0,0,Zr,Zr).data,e=new yt(t,Zr,Zr,4),r=await _0(e,{top_k:null});go.innerHTML="";for(const{label:n,score:a}of r){if(a{ga.srcObject=t,ga.play();const e=t.getVideoTracks()[0],{width:r,height:n}=e.getSettings();ga.width=r,ga.height=n;const a=r/n,[i,s]=a>720/405?[720,720/a]:[405*a,405];Mp.style.width=`${i}px`,Mp.style.height=`${s}px`,window.requestAnimationFrame(y0)}).catch(t=>{alert(t)});