---
# Configuration for a charades_ego run: a `model` section (checkpoint,
# backbone freezing, text/visual prompt settings) and a `data` section
# (dataset paths and clip sampling).
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable YAML and turned everything after the first `#` into a
# comment. The nesting below is reconstructed from key order and naming;
# confirm it against the code that consumes this config.

model:
  pretrain: ckpt/charades_ego.pt
  freeze_vis_backbone: true
  freeze_txt_backbone: true
  num_frames: 16

  text_prompt:
    n_ctx: 8
    # NOTE(review): assumed to belong to text_prompt (it follows n_ctx) — confirm.
    use_bank: true

  visual_prompt:
    num_layers: 12
    prompt_dim: 512
    num_tokens: 128
    deep: true
    deep_shared: false
    split_st: false
    pt_spt: true
    pt_tmp: false
    # VoP_c: prompts are generated by context fusion; frame-specific attention
    style: VoP_c_pool
    # number of segments per video (n_seg=clip_length -> 1 frame/seg)
    n_seg: 16
    # boundary of intra-frame/inter-frame attention (VoP_f+c)
    K_s: 8
    # NOTE(review): pool is assumed to be nested under visual_prompt because
    # style is VoP_c_pool — confirm against the config reader.
    pool:
      size: 10

data:
  dataset: charades_ego
  # Alternative absolute paths, kept disabled:
  # root: /data/CharadesEgo/CharadesEgo_v1_480
  # all testing data
  # metadata_val: /data/CharadesEgo/CharadesEgo/CharadesEgo_v1_test_only1st.csv
  root: data/charades_ego/video
  # Quoted so the `{}` placeholder stays a literal string; presumably filled
  # in by the loader with a split/file name — verify against the caller.
  metadata_val: 'data/charades_ego/csv/{}.csv'
  label_map: meta/charades_ego/charades_ego.json
  clip_length: 16
  sparse_sample: true