FFomy committed
Commit 2a9b280 · verified · 1 Parent(s): cb8606e

Create Fun-ASR/model.py

Files changed (1)
  1. Fun-ASR/model.py +632 -0

Fun-ASR/model.py ADDED
@@ -0,0 +1,632 @@
import json
import logging
import os
import random
import re
import string
import time
import traceback

import torch
import torch.nn as nn
from funasr import AutoModel
from funasr.metrics.compute_acc import compute_accuracy
from funasr.register import tables
from funasr.train_utils.device_funcs import force_gatherable, to_device
from funasr.utils.datadir_writer import DatadirWriter
from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video

dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}

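# FunASRNano wires together three parts: an audio encoder (loaded from the ModelScope hub or
# built from a registered FunASR encoder class), an audio adaptor that projects encoder frames
# to the LLM embedding width, and a Hugging Face causal LM (optionally LoRA-adapted).
# Placeholder positions reserved in the text token sequence (tracked by fbank_beg and
# fake_token_len) are overwritten with adaptor outputs, so the LLM consumes a mixed
# text/speech embedding sequence for both training and inference.
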
@tables.register("model_classes", "FunASRNano")
class FunASRNano(nn.Module):
    def __init__(
        self,
        audio_encoder: str = None,
        audio_encoder_conf: dict = None,
        audio_adaptor: str = None,
        audio_adaptor_conf: dict = None,
        llm: str = None,
        llm_conf: dict = None,
        input_size: int = 80,
        length_normalized_loss: bool = False,
        **kwargs,
    ):
        super().__init__()

        # audio encoder
        hub = audio_encoder_conf.get("hub", None)
        self.audio_encoder_activation_checkpoint = audio_encoder_conf.get("activation_checkpoint", False)
        if hub == "ms":
            model = AutoModel(model=audio_encoder, model_revision="master")
            audio_encoder_output_size = (
                model.model.encoder_output_size if hasattr(model.model, "encoder_output_size") else -1
            )
            audio_encoder = model.model.model.encoder if hasattr(model.model, "model") else model.model.encoder
        else:
            encoder_class = tables.encoder_classes.get(audio_encoder)
            audio_encoder = encoder_class(input_size=input_size, **audio_encoder_conf)
            audio_encoder_output_size = audio_encoder.output_size()
        freeze = audio_encoder_conf.get("freeze", True)
        freeze_layer_num = int(audio_encoder_conf.get("freeze_layer_num", -1))

        if freeze:
            for name, param in audio_encoder.named_parameters():
                param.requires_grad = False
            audio_encoder.eval()
        self.audio_encoder = audio_encoder

        # llm
        self.llm = None
        init_param_path = llm_conf.get("init_param_path", None)
        llm_dim = None

        from transformers import AutoModelForCausalLM

        llm_load_kwargs = llm_conf.get("load_kwargs", {})
        model = AutoModelForCausalLM.from_pretrained(
            init_param_path,
            load_in_8bit=None,
            device_map=None,
            use_cache=None,
            **llm_load_kwargs,
        )

        freeze = llm_conf.get("freeze", True)
        if freeze:
            for name, param in model.named_parameters():
                param.requires_grad = False
            model.eval()
        logging.info(f"use_lora: {llm_conf.get('use_lora', False)}")
        if llm_conf.get("use_lora", False):
            from omegaconf import DictConfig, OmegaConf

            lora_conf = llm_conf.get("lora_conf", {})
            if isinstance(lora_conf, (OmegaConf, DictConfig)):
                lora_conf = OmegaConf.to_container(lora_conf, resolve=True)
            from peft import LoraConfig, PeftModel, get_peft_model

            lora_init_param_path = lora_conf.get("init_param_path", None)
            if lora_init_param_path is not None:
                logging.info(f"lora_init_param_path: {lora_init_param_path}")
                model = PeftModel.from_pretrained(model, lora_init_param_path)
                for name, param in model.named_parameters():
                    if not lora_conf.get("freeze_lora", False):
                        if "lora_" in name:
                            param.requires_grad = True
            else:
                peft_config = LoraConfig(**lora_conf)
                model = get_peft_model(model, peft_config)
            model.print_trainable_parameters()

        if llm_conf.get("activation_checkpoint", False):
            model.gradient_checkpointing_enable()

        self.llm_dtype = llm_conf.get("llm_dtype", "fp32")
        self.llm = model.to(dtype_map[self.llm_dtype])
        llm_dim = model.get_input_embeddings().weight.shape[-1]

        # adaptor
        adaptor_class = tables.adaptor_classes.get(audio_adaptor)
        if audio_encoder_output_size > 0:
            audio_adaptor_conf["encoder_dim"] = audio_encoder_output_size
        audio_adaptor_conf["llm_dim"] = llm_dim if llm_dim is not None else audio_adaptor_conf["llm_dim"]
        audio_adaptor = adaptor_class(**audio_adaptor_conf)
        init_param_path = audio_adaptor_conf.get("init_param_path", None)
        if init_param_path is not None:
            src_state = torch.load(init_param_path, map_location="cpu")
            flag = audio_adaptor.load_state_dict(src_state, strict=False)
            logging.info(f"Loading audio_adaptor ckpt: {init_param_path}, status: {flag}")
        freeze = audio_adaptor_conf.get("freeze", False)
        if freeze:
            for name, param in audio_adaptor.named_parameters():
                param.requires_grad = False
            audio_adaptor.eval()
        self.audio_adaptor = audio_adaptor

        self.length_normalized_loss = length_normalized_loss
        self.feat_permute = audio_encoder_conf.get("feat_permute", True)
        rank = int(os.environ.get("RANK", 0))
        logging.info(f"rank: {rank}, model is built.")

    def forward(
        self,
        speech: torch.Tensor = None,
        speech_lengths: torch.Tensor = None,
        input_ids: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        labels_ids: torch.Tensor = None,
        fbank_beg: torch.Tensor = None,
        fbank_mask: torch.Tensor = None,
        **kwargs,
    ):
        batch_size, token_num = input_ids.shape
        stats = {}
        input_ids[input_ids < 0] = 0
        inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
        if speech is not None:
            if len(speech_lengths.size()) > 1:
                speech_lengths = speech_lengths[:, 0]
            batch_size_speech, frames, _ = speech.shape

            # audio encoder
            if self.audio_encoder_activation_checkpoint:
                from torch.utils.checkpoint import checkpoint

                encoder_out, encoder_out_lens = checkpoint(self.encode, speech, speech_lengths, use_reentrant=False)
            else:
                encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)

            # audio_adaptor
            encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)

            batch_size, token_num, dims = inputs_embeds.shape
            fake_token_len = kwargs.get("fake_token_len")
            fake_token_len[fake_token_len < 0] = 0
            fbank_beg[fbank_beg < 0] = 0

            # splice the adaptor outputs into the placeholder positions of the text embeddings
            speech_idx = 0
            for batch_idx in range(batch_size):
                for turn_id in range(fbank_beg.shape[1]):
                    fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
                    if fbank_beg_idx > 0:
                        speech_token_len = fake_token_len[batch_idx, turn_id]
                        speech_token = encoder_out[speech_idx, :speech_token_len, :]

                        try:
                            inputs_embeds[
                                batch_idx,
                                fbank_beg_idx : fbank_beg_idx + speech_token_len,
                                :,
                            ] = speech_token
                        except Exception as e:
                            logging.error(f"{str(e)}, {traceback.format_exc()}")
                            logging.info(
                                f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
                            )
                            # fall back to the actual encoder output length
                            speech_token_len = encoder_out_lens[speech_idx].item()
                            speech_token = encoder_out[speech_idx, :speech_token_len, :]
                            inputs_embeds[
                                batch_idx,
                                fbank_beg_idx : fbank_beg_idx + speech_token_len,
                                :,
                            ] = speech_token

                        speech_idx += 1

            stats["batch_size_speech"] = batch_size_speech
            stats["batch_size_x_frames"] = frames * batch_size_speech
            stats["batch_size_real_frames"] = speech_lengths.sum().item()
            stats["padding_frames"] = stats["batch_size_x_frames"] - stats["batch_size_real_frames"]

        with torch.cuda.amp.autocast(
            enabled=True if self.llm_dtype != "fp32" else False,
            dtype=dtype_map[self.llm_dtype],
        ):
            labels_ids[labels_ids == -1] = -100
            attention_mask[attention_mask < 0] = 0
            model_outputs = self.llm(
                inputs_embeds=inputs_embeds.to(dtype_map[self.llm_dtype]),
                attention_mask=attention_mask,
                labels=labels_ids,
            )
            loss = model_outputs.loss

        with torch.no_grad():
            preds = torch.argmax(model_outputs.logits, -1)
            acc_att = compute_accuracy(preds[:, :-1], labels_ids[:, 1:], ignore_label=-100)
            stats["acc"] = acc_att

        stats["loss"] = torch.clone(loss.detach())
        stats["batch_size"] = batch_size

        stats["batch_size_x_tokens"] = token_num * batch_size
        stats["batch_size_real_tokens"] = attention_mask.sum().item()
        stats["padding_tokens"] = stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]

        dialog_turns = (fbank_beg > 0).sum(-1)
        dialog_turns_max = torch.max(dialog_turns).int().item()
        dialog_turns_avg = dialog_turns.sum().item() / batch_size
        stats["dialog_turns_max"] = dialog_turns_max
        stats["dialog_turns_avg"] = dialog_turns_avg

        # force_gatherable: to-device and to-tensor if scalar for DataParallel
        if self.length_normalized_loss:
            batch_size = int((labels_ids > 0 + 1).sum())
        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight

    def forward_export(self, speech, speech_lengths, **kwargs):
        x, olens = self.audio_encoder(speech, speech_lengths)
        encoder_out, encoder_out_lens = self.audio_adaptor(x, olens)
        return encoder_out, encoder_out_lens

    def encode(self, speech, speech_lengths):
        # audio encoder
        if self.feat_permute:
            encoder_out, encoder_out_lens = self.audio_encoder(speech.permute(0, 2, 1), speech_lengths)
        else:
            encoder_out, encoder_out_lens = self.audio_encoder(speech, speech_lengths)

        return encoder_out, encoder_out_lens

    def data_template(self, data):
        system, user, assistant = [], [], []
        for i, item in enumerate(data):
            role = item["role"]
            content = item["content"]
            if role == "system":
                system.append(content)
            elif role == "user":
                if "audio" in item:
                    audio = item["audio"]
                    content = [content, audio]
                user.append(content)
            elif role == "assistant":
                assistant.append(content)

        system = system * len(user)

        contents = {
            "system": system,
            "user": user,
            "assistant": assistant,
        }

        return contents

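    # Shape of the message list data_template() expects, mirroring what inference() builds for
    # a plain wav path (the path below is only illustrative):
    #   [
    #       {"role": "system", "content": "You are a helpful assistant."},
    #       {"role": "user", "content": "语音转写:<|startofspeech|>!/path/to/audio.wav<|endofspeech|>"},
    #       {"role": "assistant", "content": "null"},
    #   ]
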
    def data_load_speech(self, contents: dict, tokenizer, frontend, meta_data={}, **kwargs):
        system = contents["system"]
        user = contents["user"]
        assistant = contents["assistant"]
        pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
        do_think = True
        sys_prompt = True
        if "dataset_conf" in kwargs:
            do_think = kwargs["dataset_conf"].get("do_think", True)
            sys_prompt = kwargs["dataset_conf"].get("sys_prompt", True)

        input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg, fake_token_len = (
            [],
            [],
            [],
            [],
            [],
            [],
            [],
        )
        input_source_ids = []
        for i, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
            if i >= kwargs.get("multiturn_num_max", 5):
                break
            if len(input_ids) > kwargs.get("max_token_length", 1500):
                break
            if isinstance(user_prompt, (list, tuple)):
                user_prompt, audio = user_prompt
            if i == 0:
                if kwargs.get("infer_with_assistant_input", False):
                    source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}"
                    if not sys_prompt:
                        source_input = f"<|im_start|>user\n{user_prompt}"
                else:
                    source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
                    if not sys_prompt:
                        source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
            else:
                if kwargs.get("infer_with_assistant_input", False):
                    source_input = f"<|im_start|>user\n{user_prompt}"
                else:
                    source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
            if not do_think:
                source_input += "<think>\n\n</think>\n\n"

            splits = pattern.split(source_input)
            source_ids = []
            fbank_mask_i = []
            fake_token_len_i = 0
            fbank_beg_i = -1
            speech, speech_lengths = [], []
            for k, sub_str in enumerate(splits):
                if not sub_str.startswith("<|startofspeech|>"):
                    sub_token = tokenizer.encode(sub_str)
                    source_ids += sub_token
                    fbank_mask_i += [0] * len(sub_token)
                else:
                    sub_str = sub_str.replace("<|startofspeech|>", "").replace("<|endofspeech|>", "")
                    if sub_str.startswith("!"):
                        sub_str = sub_str[1:]
                        if sub_str.startswith("!"):  # !!: audio sample point
                            sub_str = audio
                        try:
                            time1 = time.perf_counter()
                            data_src = load_audio_text_image_video(sub_str, fs=frontend.fs, **kwargs)
                            time2 = time.perf_counter()
                            meta_data["load_data"] = f"{time2 - time1:0.3f}"
                        except Exception as e:
                            logging.error(f"Loading wav failed! {str(e)}, {traceback.format_exc()}")

                        speech, speech_lengths = extract_fbank(
                            data_src,
                            data_type=kwargs.get("data_type", "sound"),
                            frontend=frontend,
                            is_final=True,
                        )  # speech: [b, T, d]

                        time3 = time.perf_counter()
                        meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
                        meta_data["batch_data_time"] = (
                            speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
                        )

                        if self.feat_permute:
                            speech = speech.permute(0, 2, 1)

                        # Number of placeholder tokens reserved for this audio segment: the
                        # arithmetic matches two kernel-3/stride-2 stages followed by a further
                        # 2x reduction, i.e. roughly one token per 8 feature frames (the stage
                        # layout is inferred from the formula, not stated elsewhere in the file).
                        olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
                        olens = 1 + (olens - 3 + 2 * 1) // 2
                        fake_token_len_i = (olens - 1) // 2 + 1
                        fake_token = [0] * fake_token_len_i
                        fbank_beg_i = len(source_ids)
                        source_ids += fake_token
                        fbank_mask_i += [1] * len(fake_token)

            fbank_beg += [fbank_beg_i + len(input_ids)]
            fake_token_len += [fake_token_len_i]
            source_mask = [-100] * len(source_ids)
            target_out = f"{target_out}<|im_end|>"
            target_ids = tokenizer.encode(target_out)
            input_source_ids = input_ids + source_ids
            input_ids += source_ids + target_ids
            labels += source_mask + target_ids
            fbank_mask += fbank_mask_i
            if len(speech) > 0:
                fbank.append(speech[0, :, :])
                fbank_lens.append(speech_lengths)

        input_ids = torch.tensor(input_ids, dtype=torch.int64)  # [: self.max_token_length]
        attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
        labels = torch.tensor(labels, dtype=torch.int64)  # [: self.max_token_length]

        fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
        fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
        fake_token_len = torch.tensor(fake_token_len, dtype=torch.int32)
        source_ids = torch.tensor(input_source_ids, dtype=torch.int64)
        target_ids = torch.tensor(target_ids, dtype=torch.int64)

        if len(fbank) > 0:
            speech = torch.nn.utils.rnn.pad_sequence(fbank, batch_first=True, padding_value=0.0)
            speech_lengths = torch.nn.utils.rnn.pad_sequence(fbank_lens, batch_first=True, padding_value=-1)
        else:
            speech = []
            speech_lengths = []
        output = {
            "speech": speech,
            "speech_lengths": speech_lengths,
            "fbank_mask": fbank_mask[None, :],
            "fbank_beg": fbank_beg[None,],
            "fake_token_len": fake_token_len[None, :],
            "input_ids": input_ids[None,],
            "attention_mask": attention_mask[None,],
            "labels_ids": labels,
            "source_ids": source_ids[None, :],
            "target_ids": target_ids[None, :],
        }

        return output

    def inference_prepare(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
        tokenizer=None,
        frontend=None,
        **kwargs,
    ):
        meta_data = {}

        if kwargs.get("batch_size", 1) > 1:
            raise NotImplementedError("batch decoding is not implemented")

        contents = self.data_template(data_in[0])
        output = self.data_load_speech(contents, tokenizer, frontend, meta_data=meta_data, **kwargs)
        batch = to_device(output, kwargs["device"])

        # audio encoder
        speech = batch["speech"]

        if len(speech) > 0:
            if "audio_embedding" in kwargs and "audio_embedding_lens" in kwargs:
                encoder_out = kwargs["audio_embedding"]
                encoder_out_lens = kwargs["audio_embedding_lens"]
            else:
                speech_lengths = batch["speech_lengths"][:, 0]
                # fp16
                if kwargs.get("fp16", False):
                    speech = speech.to(torch.float16)
                elif kwargs.get("bf16", False):
                    speech = speech.to(torch.bfloat16)
                # audio encoder
                encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)

                # audio_adaptor
                encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)
            meta_data["audio_adaptor_out"] = encoder_out
            meta_data["audio_adaptor_out_lens"] = encoder_out_lens

        input_ids = batch["input_ids"]
        source_ids = batch["source_ids"]
        fbank_beg = batch["fbank_beg"]
        fake_token_len = batch["fake_token_len"]

        if not kwargs.get("tearchforing", False):  # sic: teacher forcing
            input_ids = source_ids

        input_ids[input_ids < 0] = 0
        inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)

        batch_size, token_num, dims = inputs_embeds.shape

        fake_token_len[fake_token_len < 0] = 0
        fbank_beg[fbank_beg < 0] = 0

        speech_idx = 0
        for batch_idx in range(batch_size):
            for turn_id in range(fbank_beg.shape[1]):
                fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
                if fbank_beg_idx > 0:
                    speech_token_len = fake_token_len[batch_idx, turn_id]
                    speech_token = encoder_out[speech_idx, :speech_token_len, :]

                    try:
                        inputs_embeds[
                            batch_idx,
                            fbank_beg_idx : fbank_beg_idx + speech_token_len,
                            :,
                        ] = speech_token
                    except Exception as e:
                        logging.error(f"{str(e)}, {traceback.format_exc()}")
                        logging.info(
                            f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
                        )
                        speech_token_len = encoder_out_lens[speech_idx].item()
                        speech_token = encoder_out[speech_idx, :speech_token_len, :]
                        inputs_embeds[
                            batch_idx,
                            fbank_beg_idx : fbank_beg_idx + speech_token_len,
                            :,
                        ] = speech_token

                    speech_idx += 1

        return inputs_embeds, contents, batch, source_ids, meta_data

    def inference(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
        tokenizer=None,
        frontend=None,
        **kwargs,
    ):
        new_data_in = []
        for data in data_in:
            if isinstance(data, str):
                new_data_in.append(
                    [
                        {"role": "system", "content": "You are a helpful assistant."},
                        # "语音转写" = "speech transcription"; "!" marks a wav path
                        {"role": "user", "content": f"语音转写:<|startofspeech|>!{data}<|endofspeech|>"},
                        {"role": "assistant", "content": "null"},
                    ]
                )
            elif isinstance(data, torch.Tensor):
                new_data_in.append(
                    [
                        {"role": "system", "content": "You are a helpful assistant."},
                        # "!!" marks an in-memory waveform passed via the "audio" field
                        {"role": "user", "content": f"语音转写:<|startofspeech|>!!<|endofspeech|>", "audio": data},
                        {"role": "assistant", "content": "null"},
                    ]
                )
        data_in = new_data_in

        if key is None:
            key = []
            for _ in data_in:
                chars = string.ascii_letters + string.digits
                key.append("rand_key_" + "".join(random.choice(chars) for _ in range(13)))

        return self.inference_llm(
            data_in,
            data_lengths=data_lengths,
            key=key,
            tokenizer=tokenizer,
            frontend=frontend,
            **kwargs,
        )

    def inference_llm(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
        tokenizer=None,
        frontend=None,
        **kwargs,
    ):
        inputs_embeds, contents, batch, source_ids, meta_data = self.inference_prepare(
            data_in, data_lengths, key, tokenizer, frontend, **kwargs
        )
        llm_dtype = kwargs.get("llm_dtype", "fp32")
        if llm_dtype == "fp32":
            llm_dtype = "fp16" if kwargs.get("fp16", False) else llm_dtype
            llm_dtype = "bf16" if kwargs.get("bf16", False) else llm_dtype

        with torch.cuda.amp.autocast(enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]):
            label = contents["assistant"][-1]
            self.llm = self.llm.to(dtype_map[llm_dtype])
            inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
            llm_kwargs = kwargs.get("llm_kwargs", {})
            # sic: teacher forcing; note the spelling differs from "tearchforing" in inference_prepare
            if not kwargs.get("teachforing", False):
                generated_ids = self.llm.generate(
                    inputs_embeds=inputs_embeds,
                    max_new_tokens=kwargs.get("max_length", 512),
                    **llm_kwargs,
                )

                response = tokenizer.batch_decode(
                    generated_ids,
                    skip_special_tokens=kwargs.get("skip_special_tokens", True),
                )[0]

                loss = None
            else:
                labels_ids = batch["labels_ids"]
                labels_ids[labels_ids == -1] = -100
                attention_mask = batch.get("attention_mask", None)
                model_outputs = self.llm(
                    inputs_embeds=inputs_embeds,
                    attention_mask=attention_mask,
                    labels=labels_ids,
                    **llm_kwargs,
                )

                preds = torch.argmax(model_outputs.logits, -1)[:, source_ids.shape[1] :]
                response = tokenizer.batch_decode(
                    preds,
                    add_special_tokens=False,
                    skip_special_tokens=kwargs.get("skip_special_tokens", True),
                )[0]
                loss = model_outputs.loss.item()

        ibest_writer = None
        if kwargs.get("output_dir") is not None:
            if not hasattr(self, "writer"):
                self.writer = DatadirWriter(kwargs.get("output_dir"))
            ibest_writer = self.writer[f"{0 + 1}best_recog"]

        results = []
        response_clean = re.sub(r"[^\w\s\u3000\u4e00-\u9fff]+", "", response)
        result_i = {
            "key": key[0],
            "text": response,
            "text_tn": response_clean,
            "label": label,
        }
        if loss is not None:
            result_i["loss"] = loss
        results.append(result_i)

        if ibest_writer is not None:
            ibest_writer["text"][key[0]] = response.replace("\n", " ")
            ibest_writer["label"][key[0]] = label.replace("\n", " ")
            ibest_writer["text_tn"][key[0]] = response_clean

        return results, meta_data

    @staticmethod
    def from_pretrained(model: str = None, **kwargs):
        from funasr import AutoModel

        model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs)

        return model, kwargs