Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ Requires Python 3.10+, PyTorch 2.6+.
- **Line length: 119** (not default 88) — consistent across black, isort, flake8
- Black with `skip_string_normalization = true`
- isort with `profile = black`
- Check: `python setup.py style --scope <path>`
- Fix: `python setup.py style --scope <path> --fix`
- **Incremental reformatting**: most collections are excluded from black (see `extend-exclude` in pyproject.toml). The files are reformatted when somebody makes changes to avoid a single big reformatting PR. Do not reformat files outside your changes.
- Check: `isort --check <path> && black --check <path>` or `isort --check . && black --check .`
- Fix: `isort <path> && black <path>` or `isort . && black .`
- Jupyter Notebooks are excluded from automatic black reformatting (see `extend-exclude`), but can still be reformatted when passed directly. Do not reformat notebooks outside your changes.

## Testing

Expand Down
7 changes: 6 additions & 1 deletion examples/asr/asr_adapters/scoring_and_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,12 @@ def display_results(df_all: pd.DataFrame, category: str, best_config: pd.Series,


def get_best_config(
df_exp: pd.DataFrame, dataset_type_col: str, key_info: dict, topk: int, show_analysis: bool, exp_type: str,
df_exp: pd.DataFrame,
dataset_type_col: str,
key_info: dict,
topk: int,
show_analysis: bool,
exp_type: str,
):
"""Get the best hyperparameter configuration for a given subset of experiments.

Expand Down
6 changes: 5 additions & 1 deletion examples/asr/export/transducer/infer_transducer_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@
def parse_arguments():
parser = ArgumentParser()
parser.add_argument(
"--nemo_model", type=str, default=None, required=False, help="Path to .nemo file",
"--nemo_model",
type=str,
default=None,
required=False,
help="Path to .nemo file",
)
parser.add_argument(
'--pretrained_model', type=str, default=None, required=False, help='Name of a pretrained NeMo file'
Expand Down
6 changes: 5 additions & 1 deletion examples/asr/export/transducer/infer_transducer_ts.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,11 @@
def parse_arguments():
parser = ArgumentParser()
parser.add_argument(
"--nemo_model", type=str, default=None, required=False, help="Path to .nemo file",
"--nemo_model",
type=str,
default=None,
required=False,
help="Path to .nemo file",
)
parser.add_argument(
'--pretrained_model', type=str, default=None, required=False, help='Name of a pretrained NeMo file'
Expand Down
4 changes: 3 additions & 1 deletion examples/asr/speech_classification/vad_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ def main(cfg):
'vad_stream': True,
'sample_rate': 16000,
'manifest_filepath': manifest_vad_input,
'labels': ['infer',],
'labels': [
'infer',
],
'num_workers': cfg.num_workers,
'shuffle': False,
'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
Expand Down
9 changes: 3 additions & 6 deletions examples/tts/aligner_heteronym_disambiguation.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@


def get_args():
"""Retrieve arguments for disambiguation.
"""
"""Retrieve arguments for disambiguation."""
parser = argparse.ArgumentParser("G2P disambiguation using Aligner input embedding distances.")
# TODO(jocelynh): Make this required=False with default download from NGC once ckpt uploaded
parser.add_argument('--model', required=True, type=str, help="Path to Aligner model checkpoint (.nemo file).")
Expand Down Expand Up @@ -85,8 +84,7 @@ def get_args():


def load_and_prepare_audio(aligner, audio_path, target_sr, device):
"""Loads and resamples audio to target sample rate (if necessary), and preprocesses for Aligner input.
"""
"""Loads and resamples audio to target sample rate (if necessary), and preprocesses for Aligner input."""
# Load audio and get length for preprocessing
audio_data, orig_sr = sf.read(audio_path)
if orig_sr != target_sr:
Expand Down Expand Up @@ -238,8 +236,7 @@ def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, h
def disambiguate_dataset(
aligner, manifest_path, out_path, sr, heteronyms, confidence, device, verbose, heteronyms_only=True
):
"""Disambiguates the phonemes for all words with ambiguous pronunciations in the given manifest.
"""
"""Disambiguates the phonemes for all words with ambiguous pronunciations in the given manifest."""
log_file = open('disambiguation_logs.txt', 'w') if verbose else None

with open(out_path, 'w') as f_out:
Expand Down
6 changes: 3 additions & 3 deletions external/get_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@


def process_collection(id, col):
""" Helper function processing the collection.
"""Helper function processing the collection.

Args:
id: (short) name of the collection.
col: a collection (python module).
Expand All @@ -41,7 +41,7 @@ def process_collection(id, col):


def main():
""" Main function generating a JSON file with list of NeMo collections. """
"""Main function generating a JSON file with list of NeMo collections."""
# Parse filename.
parser = argparse.ArgumentParser()
parser.add_argument('--filename', help='Name of the output JSON file', type=str, default="collections.json")
Expand Down
6 changes: 3 additions & 3 deletions external/get_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@


def process_member(name, obj, module_list):
""" Helper function processing the passed object and, if ok, adding a record to the module list.
"""Helper function processing the passed object and, if ok, adding a record to the module list.

Args:
name: name of the member
obj: member (class/function etc.)
Expand Down Expand Up @@ -74,7 +74,7 @@ def process_member(name, obj, module_list):


def main():
""" Main function analysing the indicated NeMo collection and generating a JSON file with module descriptions. """
"""Main function analysing the indicated NeMo collection and generating a JSON file with module descriptions."""
# Parse filename.
parser = argparse.ArgumentParser()
parser.add_argument('--collection', help='ID of the collection', type=str)
Expand Down
3 changes: 1 addition & 2 deletions nemo/collections/asr/data/audio_to_ctm_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@

@dataclass
class FrameCtmUnit:
"""A container class for one CTM unit with start and length countable in frames.
"""
"""A container class for one CTM unit with start and length countable in frames."""

label: str
start_frame: int
Expand Down
18 changes: 15 additions & 3 deletions nemo/collections/asr/data/audio_to_label_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@ def get_tarred_classification_label_dataset(


def get_concat_tarred_speech_label_dataset(
featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int,
featurizer,
config: dict,
shuffle_n: int,
global_rank: int,
world_size: int,
):
tarred_audio_filepaths = config['tarred_audio_filepaths']
manifest_filepaths = config['manifest_filepath']
Expand All @@ -143,7 +147,11 @@ def get_concat_tarred_speech_label_dataset(
conf['manifest_filepath'] = manifest_filepath
conf['tarred_audio_filepaths'] = tarred_audio_filepath
dataset = get_tarred_speech_label_dataset(
config=conf, featurizer=featurizer, shuffle_n=shuffle_n, global_rank=global_rank, world_size=world_size,
config=conf,
featurizer=featurizer,
shuffle_n=shuffle_n,
global_rank=global_rank,
world_size=world_size,
)
datasets.append(dataset)

Expand All @@ -160,7 +168,11 @@ def get_concat_tarred_speech_label_dataset(


def get_tarred_speech_label_dataset(
featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int,
featurizer,
config: dict,
shuffle_n: int,
global_rank: int,
world_size: int,
) -> audio_to_label.TarredAudioToSpeechLabelDataset:
"""
Instantiates a Speech Label (e.g. VAD, speaker recognition) TarredAudioLabelDataset.
Expand Down
37 changes: 22 additions & 15 deletions nemo/collections/asr/data/feature_to_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def _feature_collate_fn(batch):
"""collate batch of feat sig, feat len, labels, labels len, assuming all features have the same shape.
Args:
batch (FloatTensor, LongTensor, LongTensor, LongTensor): A tuple of tuples of feature, feature lengths,
encoded labels, and encoded labels length.
encoded labels, and encoded labels length.
"""
packed_batch = list(zip(*batch))
if len(packed_batch) == 5:
Expand Down Expand Up @@ -61,7 +61,7 @@ def _audio_feature_collate_fn(batch, feat_pad_val, label_pad_id):
Args:
batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
LongTensor): A tuple of tuples of feature, feature lengths,
labels, and label lengths. This collate func assumes the
labels, and label lengths. This collate func assumes the
features are torch tensors of Log-Melspectrogram (i.e. [N_MEL, T]).
"""
packed_batch = list(zip(*batch))
Expand Down Expand Up @@ -178,8 +178,7 @@ class _FeatureSeqSpeakerLabelDataset(Dataset):

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
"""Returns definitions of module output ports.
"""
"""Returns definitions of module output ports."""
# TODO output type for external features
output_types = {
'external_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
Expand All @@ -197,16 +196,26 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]:
)
else:
output_types.update(
{'label': NeuralType(('B', 'T'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),}
{
'label': NeuralType(('B', 'T'), LabelsType()),
'label_length': NeuralType(tuple('B'), LengthsType()),
}
)

return output_types

def __init__(
self, *, manifest_filepath: str, labels: List[str], feature_loader, is_speaker_emb: bool = False,
self,
*,
manifest_filepath: str,
labels: List[str],
feature_loader,
is_speaker_emb: bool = False,
):
super().__init__()
self.collection = collections.ASRFeatureSequenceLabel(manifests_files=manifest_filepath.split(','),)
self.collection = collections.ASRFeatureSequenceLabel(
manifests_files=manifest_filepath.split(','),
)

self.feature_loader = feature_loader
self.labels = labels if labels else self.collection.uniq_labels
Expand Down Expand Up @@ -259,12 +268,12 @@ def _collate_fn(self, batch):

class FeatureToLabelDataset(Dataset):
"""
Dataset that loads tensors via a json file containing paths to feature files and their labels.
Dataset that loads tensors via a json file containing paths to feature files and their labels.
Each new line is a different sample. Example below:
and their target labels. JSON files should be of the following format:
{"feature_filepath": "/path/to/audio_feature.pt", "label": "1"}
...
{"feature_filepath": "/path/to/audio_feature.pt", "label": "0"}
{"feature_filepath": "/path/to/audio_feature.pt", "label": "0"}
Args:
manifest_filepath (str): Path to JSON containing data.
labels (Optional[list]): List of unique labels collected from all samples.
Expand All @@ -283,8 +292,7 @@ class FeatureToLabelDataset(Dataset):

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
"""Returns definitions of module output ports.
"""
"""Returns definitions of module output ports."""
output_types = {
'audio_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
'feat_length': NeuralType(tuple('B'), LengthsType()),
Expand Down Expand Up @@ -375,12 +383,12 @@ def _vad_segment_collate_fn(self, batch):

class FeatureToMultiLabelDataset(Dataset):
"""
Dataset that loads tensors via a json file containing paths to feature files and their labels.
Dataset that loads tensors via a json file containing paths to feature files and their labels.
Each new line is a different sample. Example below:
and their target labels. JSON files should be of the following format:
{"feature_filepath": "/path/to/audio_feature.pt", "label": "1 1 0 0 1"}
...
{"feature_filepath": "/path/to/audio_feature.pt", "label": "0 1 0 0"}
{"feature_filepath": "/path/to/audio_feature.pt", "label": "0 1 0 0"}
Args:
manifest_filepath (str): Path to JSON containing data.
labels (Optional[list]): List of unique labels collected from all samples.
Expand All @@ -397,8 +405,7 @@ class FeatureToMultiLabelDataset(Dataset):

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
"""Returns definitions of module output ports.
"""
"""Returns definitions of module output ports."""
output_types = {
'audio_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
'feat_length': NeuralType(tuple('B'), LengthsType()),
Expand Down
4 changes: 3 additions & 1 deletion nemo/collections/asr/data/feature_to_label_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ def get_feature_seq_speakerlabel_dataset(
An instance of FeatureToSeqSpeakerLabelDataset.
"""
dataset = feature_to_label.FeatureToSeqSpeakerLabelDataset(
manifest_filepath=config['manifest_filepath'], labels=config['labels'], feature_loader=feature_loader,
manifest_filepath=config['manifest_filepath'],
labels=config['labels'],
feature_loader=feature_loader,
)
return dataset

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@


def get_hf_audio_to_text_bpe_dataset(
config: DictConfig, global_rank: int, world_size: int, tokenizer, augmentor=None,
config: DictConfig,
global_rank: int,
world_size: int,
tokenizer,
augmentor=None,
):
if "streaming" in config and config["streaming"]:
dataset = HFIterableAudioToBPEDataset(
Expand Down Expand Up @@ -72,7 +76,10 @@ def get_hf_audio_to_text_bpe_dataset(


def get_hf_audio_to_text_char_dataset(
config: DictConfig, global_rank: int, world_size: int, augmentor=None,
config: DictConfig,
global_rank: int,
world_size: int,
augmentor=None,
):
if "streaming" in config and config["streaming"]:
dataset = HFIterableAudioToCharDataset(
Expand Down
5 changes: 2 additions & 3 deletions nemo/collections/asr/losses/angularloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,12 @@ class AngularSoftmaxLoss(Loss, Typing):
reference: https://openaccess.thecvf.com/content_CVPR_2019/papers/Deng_ArcFace_Additive_Angular_Margin_Loss_for_Deep_Face_Recognition_CVPR_2019_paper.pdf
args:
scale: scale value for cosine angle
margin: margin value added to cosine angle
margin: margin value added to cosine angle
"""

@property
def input_types(self):
"""Input types definitions for AnguarLoss.
"""
"""Input types definitions for AngularSoftmaxLoss."""
return {
"logits": NeuralType(('B', 'D'), LogitsType()),
"labels": NeuralType(('B',), LabelsType()),
Expand Down
3 changes: 1 addition & 2 deletions nemo/collections/asr/losses/ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@
class CTCLoss(nn.CTCLoss, Serialization, Typing):
@property
def input_types(self):
"""Input types definitions for CTCLoss.
"""
"""Input types definitions for CTCLoss."""
return {
"log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()),
"targets": NeuralType(('B', 'T'), LabelsType()),
Expand Down
9 changes: 3 additions & 6 deletions nemo/collections/asr/losses/rnnt_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@
class RNNTLossPytorch(Loss):
@property
def input_types(self):
"""Input types definitions for CTCLoss.
"""
"""Input types definitions for RNNTLossPytorch."""
return {
"acts": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()),
"labels": NeuralType(('B', 'T'), LabelsType()),
Expand Down Expand Up @@ -126,8 +125,7 @@ class TDTLossPytorch(Loss):

@property
def input_types(self):
"""Input types definitions for CTCLoss.
"""
"""Input types definitions for TDTLossPytorch."""
return {
"acts": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()),
"labels": NeuralType(('B', 'T'), LabelsType()),
Expand Down Expand Up @@ -256,8 +254,7 @@ class MultiblankRNNTLossPytorch(Loss):

@property
def input_types(self):
"""Input types definitions for CTCLoss.
"""
"""Input types definitions for MultiblankRNNTLossPytorch."""
return {
"acts": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()),
"labels": NeuralType(('B', 'T'), LabelsType()),
Expand Down
3 changes: 1 addition & 2 deletions nemo/collections/asr/losses/ssl_losses/ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@
class CTCLossForSSL(Loss):
@property
def input_types(self):
"""Input types definitions for Contrastive.
"""
"""Input types definitions for CTCLossForSSL."""
return {
"spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()),
"decoder_outputs": NeuralType(("B", "T", "D"), VoidType()),
Expand Down
Loading
Loading