Skip to content

Commit 71a2da9

Browse files
committed
Add missing files from earlier commit (mostly datasets); also fix typos and clean up some comments
1 parent 1f3253b commit 71a2da9

348 files changed

Lines changed: 2014 additions & 567 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

lavis/common/utils.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Copyright (c) 2022, salesforce.com, inc.
2+
Copyright (c) 2023, salesforce.com, inc.
33
All rights reserved.
44
SPDX-License-Identifier: BSD-3-Clause
55
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
@@ -440,3 +440,16 @@ def get_file_size(filename):
440440
"""
441441
size_in_mb = os.path.getsize(filename) / float(1024**2)
442442
return size_in_mb
443+
444+
def is_serializable(value):
    """
    Check whether the provided value can be serialized into a JSON string.

    Args:
        value: Any Python object.

    Returns:
        bool: True if ``json.dumps(value)`` succeeds, False otherwise.
    """
    try:
        json.dumps(value)
    except (TypeError, OverflowError, ValueError, RecursionError):
        # TypeError: unsupported type (e.g. set, arbitrary object).
        # ValueError: circular reference detected by the encoder.
        # RecursionError: nesting too deep for the encoder.
        return False
    return True
453+
454+
def is_convertible_to_int(value):
    """
    Tell whether ``str(value)`` looks like a plain base-10 integer.

    The text must be an optional leading ``-`` followed by one or more
    digits. A leading ``+``, surrounding spaces, or a decimal point make
    the check fail; a single trailing newline is tolerated because ``$``
    matches just before it.

    Returns:
        bool: True when the textual form matches, False otherwise.
    """
    text = str(value)
    matched = re.match(r'^-?\d+$', text)
    return matched is not None

lavis/configs/datasets/audioset/defaults_mm_cap_instruct.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ datasets:
2828
annotations:
2929
train:
3030
url:
31-
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data//audioset/balanced_train_clean.csv
32-
# - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
31+
# - https://storage.googleapis.com/sfr-xinstructblip-data-research/data//audioset/balanced_train_clean.csv
32+
- /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
3333
- http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
3434
storage:
3535
- audioset/annotations/balanced_train_clean.csv

lavis/configs/datasets/iconqa/defaults_instruct.yaml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,20 @@ datasets:
3535
storage:
3636
- iconqa/annotations/train.json
3737
# - /export/share/datasets/vision_language/iconqa/annotations_train.json
38-
val:
39-
url:
40-
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
41-
# - /export/share/datasets/vision_language/iconqa/annotations_val.json
42-
storage:
43-
- iconqa/annotations/val.json
44-
# - /export/share/datasets/vision_language/iconqa/annotations_val.json
45-
test:
46-
url:
47-
- https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
48-
# - /export/share/datasets/vision_language/iconqa/annotations_test.json
49-
storage:
50-
- iconqa/annotations/test.json
51-
# - /export/share/datasets/vision_language/iconqa/annotations_test.json
38+
# val:
39+
# url:
40+
# - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_val.json
41+
# # - /export/share/datasets/vision_language/iconqa/annotations_val.json
42+
# storage:
43+
# - iconqa/annotations/val.json
44+
# # - /export/share/datasets/vision_language/iconqa/annotations_val.json
45+
# test:
46+
# url:
47+
# - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/iconqa/annotations_test.json
48+
# # - /export/share/datasets/vision_language/iconqa/annotations_test.json
49+
# storage:
50+
# - iconqa/annotations/test.json
51+
# # - /export/share/datasets/vision_language/iconqa/annotations_test.json
5252

5353
images:
5454
storage: /export/share/datasets/vision_language/iconqa/all_images/

lavis/configs/models/blip2/blip2_xinstruct_vicuna7b.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ model:
1515
pc_model: "ulip2_pointbert"
1616
video_model: "eva_clip_g"
1717
audio_model: "beats"
18-
pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer_improved.pth
19-
pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer_improved.pth
20-
pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer_improved.pth
18+
pretrained_image_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/image_qformer.pth
19+
pretrained_pc_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/pc_qformer.pth
20+
pretrained_video_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/video_qformer.pth
2121
pretrained_audio_qformer: https://storage.googleapis.com/sfr-xinstructblip-data-research/model/xinstructblip_checkpoints/vicuna7b/audio_qformer_improved.pth
2222
load_attention_image_qformer: True
2323
load_attention_pc_qformer: True

lavis/datasets/builders/base_dataset_builder.py

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,93 @@ def build(self):
235235
return datasets
236236

237237

238+
class MultiModalDatasetBuilder(BaseDatasetBuilder):
    """
    MultiModalDatasetBuilder is a utility class designed to construct datasets
    suitable for multi-modal tasks. This class simplifies the creation of
    datasets that incorporate data of multiple modalities, such as text,
    images, video, or audio.

    ``train_dataset_cls`` and ``eval_dataset_cls`` default to None here;
    ``build`` instantiates them, so they must be set before it is called.
    """
    train_dataset_cls, eval_dataset_cls = None, None

    def __init__(self, cfg=None):
        super().__init__(cfg)
        # Accept a single modality name as well as a list of names, so the
        # rest of the class can always iterate over ``self.data_type``.
        if isinstance(self.data_type, str):
            self.data_type = [self.data_type]

    @staticmethod
    def _arg_prefix(modality):
        """Return the config/kwarg prefix for a modality ('vis' for image-like names)."""
        return 'vis' if 'image' in modality else modality

    def _build_processor(self, cfg_name):
        """
        Build the 'train' and 'eval' processors configured under ``cfg_name``.

        Returns:
            dict: ``{'train': processor, 'eval': processor}``; both values are
            None when ``cfg_name`` is absent from the builder config.
        """
        cfg = self.config.get(cfg_name)
        return {
            split: self._build_proc_from_cfg(cfg.get(split))
            if cfg is not None
            else None
            for split in ['train', 'eval']
        }

    def build_processors(self):
        """
        Populate ``self.text_processors`` (split -> processor) and
        ``self.processors`` (split -> modality -> processor).

        A modality without a ``<prefix>_processor`` config entry gets None
        for both splits, the same way a missing text processor is handled,
        instead of failing with an AttributeError on ``None.get``.
        """
        self.text_processors = self._build_processor("text_processor")

        per_modality = {
            modality: self._build_processor(f"{self._arg_prefix(modality)}_processor")
            for modality in self.data_type
        }
        self.processors = {
            split: {
                modality: built[split]
                for modality, built in per_modality.items()
            }
            for split in ['train', 'eval']
        }

    def _download_multimodal(self, modality):
        """Warn when the storage path configured for ``modality`` does not exist."""
        storage_path = utils.get_cache_path(self.config.build_info.get(modality).storage)
        if not os.path.exists(storage_path):
            warnings.warn(f"The specified path {storage_path} for {modality} inputs does not exist.")

    def _download_data(self):
        """Download annotations, then check the storage path of every modality."""
        self._download_ann()
        for modality in self.data_type:
            self._download_multimodal(modality)

    def _get_absolute_path(self, path):
        """Return ``path`` unchanged if absolute, else resolve it via ``utils.get_cache_path``."""
        if not os.path.isabs(path):
            return utils.get_cache_path(path)
        return path

    def build(self):
        """
        Instantiate one dataset per annotation split.

        Only the 'train', 'val' and 'test' entries of
        ``build_info.annotations`` are used; other keys are skipped.
        'train' uses ``train_dataset_cls``, the others ``eval_dataset_cls``.

        Returns:
            dict: split name -> dataset instance.
        """
        self.build_processors()
        build_info = self.config.build_info
        datasets = {}

        for split, info in build_info.annotations.items():
            if split not in ["train", "val", "test"]:
                continue

            is_train = split == "train"
            dataset_args = self._get_dataset_args(info, is_train)

            dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls
            datasets[split] = dataset_cls(**dataset_args)

        return datasets

    def _get_dataset_args(self, info, is_train):
        """
        Assemble the keyword arguments passed to the dataset class.

        Args:
            info: Annotation entry of one split; its ``storage`` lists the
                annotation paths.
            is_train (bool): Selects the 'train' processors when True,
                the 'eval' processors otherwise.

        Returns:
            dict: ``build_info.kwargs`` (if any) extended with per-modality
            ``<prefix>_processor`` / ``<prefix>_root`` entries,
            ``text_processor``, ``ann_paths`` and ``modalities``.
        """
        dataset_args = dict(self.config.build_info.get('kwargs', {}))
        proc_split = "train" if is_train else "eval"

        for modality in self.data_type:
            prefix = self._arg_prefix(modality)
            dataset_args[f"{prefix}_processor"] = self.processors[proc_split][modality]
            mm_path = self._get_absolute_path(self.config.build_info.get(modality).storage)
            dataset_args[f"{prefix}_root"] = mm_path

        dataset_args['text_processor'] = self.text_processors[proc_split]
        dataset_args["ann_paths"] = [self._get_absolute_path(path) for path in info.storage]
        dataset_args['modalities'] = self.data_type

        # Conform to base
        # NOTE(review): 'test_processor' may be a typo for 'text_processor'
        # (which is always set above); kept as-is because dataset classes
        # currently receive this key -- confirm before changing.
        for key in ['vis_processor', 'vis_root', 'test_processor']:
            dataset_args.setdefault(key, None)

        return dataset_args
324+
238325
def load_dataset_config(cfg_path):
    """
    Load a dataset config file and return the config of its first dataset.

    Args:
        cfg_path: Path of the config file handed to ``OmegaConf.load``.

    Returns:
        The value of the first entry under the top-level ``datasets`` key.
    """
    datasets_cfg = OmegaConf.load(cfg_path).datasets
    # Only the first dataset entry is used; any further entries are ignored.
    return next(iter(datasets_cfg.values()))

0 commit comments

Comments
 (0)