Skip to content

vllm.model_executor.models.cohere_asr

CausalConv1D

Bases: Conv1d

A causal version of nn.Conv1d where each step would have limited access to locations on its right or left. All arguments are the same as nn.Conv1d except padding.

If padding is set None, then paddings are set automatically to make it a causal convolution where each location would not see any steps on its right.

If padding is set as a list (size of 2), then padding[0] would be used as left padding and padding[1] as right padding. It would make it possible to control the number of steps to be accessible on the right and left. This mode is not supported when stride > 1. padding[0]+padding[1] should be equal to (kernel_size - 1).

Source code in vllm/model_executor/models/cohere_asr.py
class CausalConv1D(nn.Conv1d):
    """
    A causal version of nn.Conv1d where each step would
    have limited access to locations on its right or left.
    All arguments are the same as nn.Conv1d except padding.

    If padding is set None, then paddings are set
    automatically to make it a causal convolution where
    each location would not see any steps on its right.

    If padding is set as a list (size of 2), then
    padding[0] would be used as left padding and
    padding[1] as right padding. It would make it possible
    to control the number of steps to be accessible on the
    right and left. This mode is not supported when
    stride > 1. padding[0]+padding[1] should be equal to
    (kernel_size - 1).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: str | int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = "zeros",
        device=None,
        dtype=None,
    ) -> None:
        if padding is None:
            self._left_padding = kernel_size - 1
            self._right_padding = stride - 1
        else:
            if stride != 1 and padding != kernel_size - 1:
                raise ValueError("No striding allowed for non-symmetric convolutions!")
            if isinstance(padding, int):
                self._left_padding = padding
                self._right_padding = padding
            elif (
                isinstance(padding, list)
                and len(padding) == 2
                and padding[0] + padding[1] == kernel_size - 1
            ):
                self._left_padding = padding[0]
                self._right_padding = padding[1]
            else:
                raise ValueError(f"Invalid padding param: {padding}!")

        super().__init__(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=0,
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode=padding_mode,
            device=device,
            dtype=dtype,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.pad(x, pad=(self._left_padding, self._right_padding))
        return super().forward(x)

CohereASRMultiHeadAttention

Bases: Module

Multi-Head Attention layer of Transformer. Args: n_head (int): number of heads n_feat (int): size of the features use_bias (bool): whether to use bias in linear and conv layers

Source code in vllm/model_executor/models/cohere_asr.py
class CohereASRMultiHeadAttention(nn.Module):
    """Multi-Head Attention layer of Transformer.
    Args:
        n_head (int): number of heads
        n_feat (int): size of the features
        use_bias (bool): whether to remove bias in linear and conv layers
    """

    def __init__(
        self,
        n_head: int,
        n_feat: int,
        use_bias: bool = True,
    ) -> None:
        """Construct an MultiHeadedAttention object."""
        super().__init__()

        assert n_feat % n_head == 0
        self.d_k = n_feat // n_head
        self.s_d_k = math.sqrt(self.d_k)
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat, bias=use_bias)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=use_bias)
        self.linear_v = nn.Linear(n_feat, n_feat, bias=use_bias)
        self.linear_out = nn.Linear(n_feat, n_feat, bias=use_bias)

    def forward_qkv(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Transforms query, key and value.
        Args:
            query (torch.Tensor): (batch, time1, size)
            key (torch.Tensor): (batch, time2, size)
            value (torch.Tensor): (batch, time2, size)
        returns:
            q (torch.Tensor): (batch, head, time1, size)
            k (torch.Tensor): (batch, head, time2, size)
            v (torch.Tensor): (batch, head, time2, size)
        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        return q, k, v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor | None,
    ) -> torch.Tensor:
        """Compute attention context vector.
        Args:
            value (torch.Tensor): (batch, time2, size)
            scores(torch.Tensor): (batch, time1, time2)
            mask(torch.Tensor): (batch, time1, time2)
        returns:
            value (torch.Tensor): transformed `value`
                (batch, time2, d_model) weighted by the
                attention scores
        """
        n_batch = value.size(0)
        if mask is not None:
            mask = mask.unsqueeze(1)  # (batch, 1, time1, time2)
            scores = scores.masked_fill(mask, -INF_VAL)
            attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0
            )  # (batch, head, time1, time2)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        x = torch.matmul(attn, value)  # (batch, head, time1, d_k)
        x = x.transpose(1, 2).reshape(
            n_batch, -1, self.h * self.d_k
        )  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor | None,
        pos_emb: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute 'Scaled Dot Product Attention'.
        Args:
            query (torch.Tensor): (batch, time1, size)
            key (torch.Tensor): (batch, time2, size)
            value(torch.Tensor): (batch, time2, size)
            mask (torch.Tensor): (batch, time1, time2)

        returns:
            output (torch.Tensor): transformed `value`
                (batch, time1, d_model) weighted by the
                query dot key attention
        """
        q, k, v = self.forward_qkv(query, key, value)

        scores = torch.matmul(q, k.transpose(-2, -1)) / self.s_d_k
        return self.forward_attention(v, scores, mask)

__init__

__init__(
    n_head: int, n_feat: int, use_bias: bool = True
) -> None

Construct a MultiHeadedAttention object.

Source code in vllm/model_executor/models/cohere_asr.py
def __init__(
    self,
    n_head: int,
    n_feat: int,
    use_bias: bool = True,
) -> None:
    """Construct an MultiHeadedAttention object."""
    super().__init__()

    assert n_feat % n_head == 0
    self.d_k = n_feat // n_head
    self.s_d_k = math.sqrt(self.d_k)
    self.h = n_head
    self.linear_q = nn.Linear(n_feat, n_feat, bias=use_bias)
    self.linear_k = nn.Linear(n_feat, n_feat, bias=use_bias)
    self.linear_v = nn.Linear(n_feat, n_feat, bias=use_bias)
    self.linear_out = nn.Linear(n_feat, n_feat, bias=use_bias)

forward

forward(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    mask: Tensor | None,
    pos_emb: Tensor | None = None,
) -> Tensor

Compute 'Scaled Dot Product Attention'. Args: query (torch.Tensor): (batch, time1, size) key (torch.Tensor): (batch, time2, size) value(torch.Tensor): (batch, time2, size) mask (torch.Tensor): (batch, time1, time2)

Returns:

Name Type Description
output Tensor

transformed value (batch, time1, d_model) weighted by the query dot key attention

Source code in vllm/model_executor/models/cohere_asr.py
def forward(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.Tensor | None,
    pos_emb: torch.Tensor | None = None,
) -> torch.Tensor:
    """Compute 'Scaled Dot Product Attention'.
    Args:
        query (torch.Tensor): (batch, time1, size)
        key (torch.Tensor): (batch, time2, size)
        value(torch.Tensor): (batch, time2, size)
        mask (torch.Tensor): (batch, time1, time2)

    returns:
        output (torch.Tensor): transformed `value`
            (batch, time1, d_model) weighted by the
            query dot key attention
    """
    q, k, v = self.forward_qkv(query, key, value)

    scores = torch.matmul(q, k.transpose(-2, -1)) / self.s_d_k
    return self.forward_attention(v, scores, mask)

forward_attention

forward_attention(
    value: Tensor, scores: Tensor, mask: Tensor | None
) -> Tensor

Compute attention context vector. Args: value (torch.Tensor): (batch, time2, size) scores(torch.Tensor): (batch, time1, time2) mask(torch.Tensor): (batch, time1, time2) returns: value (torch.Tensor): transformed value (batch, time2, d_model) weighted by the attention scores

Source code in vllm/model_executor/models/cohere_asr.py
def forward_attention(
    self,
    value: torch.Tensor,
    scores: torch.Tensor,
    mask: torch.Tensor | None,
) -> torch.Tensor:
    """Compute attention context vector.
    Args:
        value (torch.Tensor): (batch, time2, size)
        scores(torch.Tensor): (batch, time1, time2)
        mask(torch.Tensor): (batch, time1, time2)
    returns:
        value (torch.Tensor): transformed `value`
            (batch, time2, d_model) weighted by the
            attention scores
    """
    n_batch = value.size(0)
    if mask is not None:
        mask = mask.unsqueeze(1)  # (batch, 1, time1, time2)
        scores = scores.masked_fill(mask, -INF_VAL)
        attn = torch.softmax(scores, dim=-1).masked_fill(
            mask, 0.0
        )  # (batch, head, time1, time2)
    else:
        attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

    x = torch.matmul(attn, value)  # (batch, head, time1, d_k)
    x = x.transpose(1, 2).reshape(
        n_batch, -1, self.h * self.d_k
    )  # (batch, time1, d_model)

    return self.linear_out(x)  # (batch, time1, d_model)

forward_qkv

forward_qkv(
    query: Tensor, key: Tensor, value: Tensor
) -> tuple[Tensor, Tensor, Tensor]

Transforms query, key and value. Args: query (torch.Tensor): (batch, time1, size) key (torch.Tensor): (batch, time2, size) value (torch.Tensor): (batch, time2, size) returns: q (torch.Tensor): (batch, head, time1, size) k (torch.Tensor): (batch, head, time2, size) v (torch.Tensor): (batch, head, time2, size)

Source code in vllm/model_executor/models/cohere_asr.py
def forward_qkv(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Transforms query, key and value.
    Args:
        query (torch.Tensor): (batch, time1, size)
        key (torch.Tensor): (batch, time2, size)
        value (torch.Tensor): (batch, time2, size)
    returns:
        q (torch.Tensor): (batch, head, time1, size)
        k (torch.Tensor): (batch, head, time2, size)
        v (torch.Tensor): (batch, head, time2, size)
    """
    n_batch = query.size(0)
    q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
    k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
    v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
    q = q.transpose(1, 2)
    k = k.transpose(1, 2)
    v = v.transpose(1, 2)

    return q, k, v

ConformerConvolution

Bases: Module

The convolution module for the Conformer model. Args: d_model (int): hidden dimension kernel_size (int): kernel size for depthwise convolution pointwise_activation (str): name of the activation function to be used for the pointwise conv. Note that Conformer uses a special key glu_ which is treated as the original default from the paper. use_bias (bool): Use bias in all Linear and Conv1d layers to improve activation flow and stabilize training of huge models. Defaults to True

Source code in vllm/model_executor/models/cohere_asr.py
class ConformerConvolution(nn.Module):
    """The convolution module for the Conformer model.
    Args:
        d_model (int): hidden dimension
        kernel_size (int): kernel size for depthwise convolution
        pointwise_activation (str): name of the activation
            function to be used for the pointwise conv.
            Note that Conformer uses a special key `glu_`
            which is treated as the original default from
            the paper.
        use_bias (bool): Use bias in all Linear and Conv1d
            layers to improve activation flow and stabilize
            training of huge models. Defaults to True
    """

    def __init__(
        self,
        d_model: int,
        kernel_size: int,
        norm_type: str = "batch_norm",
        conv_context_size: int | None = None,
        pointwise_activation: str = "glu_",
        use_bias: bool = True,
    ) -> None:
        super().__init__()
        assert (kernel_size - 1) % 2 == 0

        if conv_context_size is None:
            conv_context_size = (kernel_size - 1) // 2

        assert pointwise_activation == "glu_"
        dw_conv_input_dim = d_model

        self.pointwise_conv1 = nn.Conv1d(
            in_channels=d_model,
            out_channels=d_model * 2,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=use_bias,
        )

        self.depthwise_conv = CausalConv1D(
            in_channels=dw_conv_input_dim,
            out_channels=dw_conv_input_dim,
            kernel_size=kernel_size,
            stride=1,
            padding=conv_context_size,
            groups=dw_conv_input_dim,
            bias=use_bias,
        )

        assert norm_type == "batch_norm"
        self.batch_norm = nn.BatchNorm1d(dw_conv_input_dim)

        self.activation = Swish()
        self.pointwise_conv2 = nn.Conv1d(
            in_channels=dw_conv_input_dim,
            out_channels=d_model,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=use_bias,
        )

    def forward(
        self, x: torch.Tensor, pad_mask: torch.Tensor | None = None
    ) -> torch.Tensor:
        x = x.transpose(1, 2)
        x = self.pointwise_conv1(x)

        x = nn.functional.glu(x, dim=1)

        if pad_mask is not None:
            x = x.masked_fill(pad_mask.unsqueeze(1), 0.0)

        x = self.depthwise_conv(x)

        x = self.batch_norm(x)

        x = self.activation(x)
        x = self.pointwise_conv2(x)
        x = x.transpose(1, 2)
        return x

ConformerEncoder

Bases: Module

The encoder for ASR model of Conformer. Based on this paper: 'Conformer: Convolution-augmented Transformer for Speech Recognition' by Anmol Gulati et al. https://arxiv.org/abs/2005.08100

Source code in vllm/model_executor/models/cohere_asr.py
(rendered source line-number gutter, lines 1323–1701, omitted)
class ConformerEncoder(nn.Module):
    """
    The encoder for ASR model of Conformer.
    Based on this paper:
    'Conformer: Convolution-augmented Transformer for
    Speech Recognition' by Anmol Gulati et al.
    https://arxiv.org/abs/2005.08100
    """

    def __init__(self, *, vllm_config: VllmConfig):
        super().__init__()

        self.hf_config = vllm_config.model_config.hf_config

        feat_in = self.hf_config.encoder["feat_in"]
        n_layers = self.hf_config.encoder["n_layers"]
        d_model = self.hf_config.encoder["d_model"]
        feat_out = self.hf_config.encoder["feat_out"]
        causal_downsampling = self.hf_config.encoder["causal_downsampling"]
        subsampling = self.hf_config.encoder["subsampling"]
        subsampling_factor = self.hf_config.encoder["subsampling_factor"]
        subsampling_conv_chunking_factor = self.hf_config.encoder.get(
            "subsampling_conv_chunking_factor", 1
        )
        subsampling_conv_channels = self.hf_config.encoder["subsampling_conv_channels"]
        ff_expansion_factor = self.hf_config.encoder["ff_expansion_factor"]
        self_attention_model = self.hf_config.encoder["self_attention_model"]
        n_heads = self.hf_config.encoder["n_heads"]
        att_context_size = self.hf_config.encoder["att_context_size"]
        att_context_probs = self.hf_config.encoder.get("att_context_probs", None)
        att_context_style = self.hf_config.encoder.get("att_context_style", "regular")
        xscaling = self.hf_config.encoder["xscaling"]
        untie_biases = self.hf_config.encoder["untie_biases"]
        pos_emb_max_len = self.hf_config.encoder["pos_emb_max_len"]
        conv_kernel_size = self.hf_config.encoder["conv_kernel_size"]
        conv_norm_type = self.hf_config.encoder["conv_norm_type"]
        conv_context_size = self.hf_config.encoder["conv_context_size"]
        use_bias = self.hf_config.encoder.get("use_bias", True)

        d_ff = d_model * ff_expansion_factor
        self.d_model = d_model
        self._feat_in = feat_in
        self.att_context_style = att_context_style
        self.subsampling_factor = subsampling_factor

        self.self_attention_model = self_attention_model

        # Setting up the att_context_size
        (
            _,
            self.att_context_size,
            _,
            self.conv_context_size,
        ) = self._calc_context_sizes(
            att_context_style=att_context_style,
            att_context_size=att_context_size,
            att_context_probs=att_context_probs,
            conv_context_size=conv_context_size,
            conv_kernel_size=conv_kernel_size,
        )

        if xscaling:
            self.xscale = math.sqrt(d_model)
        else:
            self.xscale = None

        # Subsampling
        if subsampling_conv_channels == -1:
            subsampling_conv_channels = d_model
        assert subsampling and subsampling_factor > 1 and subsampling == "dw_striding"

        self.pre_encode = ConvSubsampling(
            subsampling=subsampling,
            subsampling_factor=subsampling_factor,
            feat_in=feat_in,
            feat_out=d_model,
            conv_channels=subsampling_conv_channels,
            subsampling_conv_chunking_factor=subsampling_conv_chunking_factor,
            activation=nn.ReLU(True),
            is_causal=causal_downsampling,
        )

        self._feat_out = d_model

        # Biases for relative positional encoding
        if not untie_biases and self_attention_model == "rel_pos":
            d_head = d_model // n_heads
            # Register as buffers instead of parameters since they're not trainable
            # and need to respect dtype during weight loading
            self.register_buffer(
                "pos_bias_u", torch.zeros(n_heads, d_head), persistent=True
            )
            self.register_buffer(
                "pos_bias_v", torch.zeros(n_heads, d_head), persistent=True
            )
            pos_bias_u = self.pos_bias_u
            pos_bias_v = self.pos_bias_v
        else:
            pos_bias_u = None
            pos_bias_v = None

        # Positional encodings
        self.pos_emb_max_len = pos_emb_max_len
        assert self_attention_model == "rel_pos"
        self.pos_enc = RelPositionalEncoding(
            d_model=d_model,
            max_len=pos_emb_max_len,
            xscale=self.xscale,
        )

        self.layers = nn.ModuleList()
        for i in range(n_layers):
            layer = ConformerLayer(
                d_model=d_model,
                d_ff=d_ff,
                self_attention_model=self_attention_model,
                n_heads=n_heads,
                conv_kernel_size=conv_kernel_size,
                conv_norm_type=conv_norm_type,
                conv_context_size=self.conv_context_size,
                pos_bias_u=pos_bias_u,
                pos_bias_v=pos_bias_v,
                att_context_size=self.att_context_size,
                use_bias=use_bias,
            )
            self.layers.append(layer)

        if feat_out > 0 and feat_out != self._feat_out:
            self.out_proj = nn.Linear(self._feat_out, feat_out)
            self._feat_out = feat_out
        else:
            self.out_proj = None
            self._feat_out = d_model
        self.set_max_audio_length(self.pos_emb_max_len)

    def get_num_encoder_cross_attn_tokens(self, num_encoder_input_tokens: int) -> int:
        num_encoder_cross_attn_tokens = math.ceil(
            num_encoder_input_tokens / self.subsampling_factor
        )
        return num_encoder_cross_attn_tokens

    def set_max_audio_length(self, max_audio_length: int) -> None:
        """
        Sets maximum input length.
        Pre-calculates internal seq_range mask.

        Args:
            max_audio_length (int): New maximum sequence length.
        """
        device = next(self.parameters()).device
        dtype = next(self.parameters()).dtype
        self.pos_enc.extend_pe(max_audio_length, device, dtype)

    def forward(
        self,
        audio_signal: torch.Tensor,
        length: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if audio_signal.shape[-2] != self._feat_in:
            raise ValueError(
                f"audio_signal should have shape "
                f"(batch, {self._feat_in}, n_frame) but "
                f"got last dimension "
                f"{audio_signal.shape[-2]}."
            )

        return self.forward_internal(
            audio_signal,
            length,
        )

    def forward_internal(
        self,
        audio_signal: torch.Tensor,
        length: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if length is None:
            length = audio_signal.new_full(
                (audio_signal.size(0),),
                audio_signal.size(-1),
                dtype=torch.int64,
                device=audio_signal.device,
            )

        cur_att_context_size = self.att_context_size
        audio_signal = torch.transpose(audio_signal, 1, 2)

        audio_signal, length = self.pre_encode(x=audio_signal, lengths=length)
        length = length.to(torch.int64)

        max_audio_length = audio_signal.size(1)

        padding_length = length

        audio_signal, pos_emb = self.pos_enc(x=audio_signal, cache_len=0)

        pad_mask, att_mask = self._create_masks(
            att_context_size=cur_att_context_size,
            padding_length=padding_length,
            max_audio_length=max_audio_length,
            offset=None,
            device=audio_signal.device,
        )

        for lth, layer in enumerate(self.layers):
            audio_signal = layer(
                x=audio_signal,
                att_mask=att_mask,
                pos_emb=pos_emb,
                pad_mask=pad_mask,
            )

        if self.out_proj is not None:
            audio_signal = self.out_proj(audio_signal)

        audio_signal = torch.transpose(audio_signal, 1, 2)
        length = length.to(dtype=torch.int64)

        return audio_signal, length

    def _create_masks(
        self,
        att_context_size: list[int],
        padding_length: torch.Tensor,
        max_audio_length: int,
        offset: torch.Tensor | None,
        device: torch.device,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        if self.self_attention_model != "rel_pos_local_attn":
            att_mask = torch.ones(
                1, max_audio_length, max_audio_length, dtype=torch.bool, device=device
            )

            if self.att_context_style == "regular":
                if att_context_size[0] >= 0:
                    att_mask = att_mask.triu(diagonal=-att_context_size[0])
                if att_context_size[1] >= 0:
                    att_mask = att_mask.tril(diagonal=att_context_size[1])
            elif self.att_context_style == "chunked_limited":
                # When right context is unlimited, just the
                # left side of masking needs to get updated
                if att_context_size[1] == -1:
                    if att_context_size[0] >= 0:
                        att_mask = att_mask.triu(diagonal=-att_context_size[0])
                else:
                    chunk_size = att_context_size[1] + 1
                    # left_chunks_num specifies the number
                    # of chunks to be visible by each chunk
                    # on the left side
                    if att_context_size[0] >= 0:
                        left_chunks_num = att_context_size[0] // chunk_size
                    else:
                        left_chunks_num = 10000

                    chunk_idx = torch.arange(
                        0, max_audio_length, dtype=torch.int, device=att_mask.device
                    )
                    chunk_idx = torch.div(chunk_idx, chunk_size, rounding_mode="trunc")
                    diff_chunks = chunk_idx.unsqueeze(1) - chunk_idx.unsqueeze(0)
                    chunked_limited_mask = torch.logical_and(
                        torch.le(diff_chunks, left_chunks_num), torch.ge(diff_chunks, 0)
                    )
                    att_mask = torch.logical_and(
                        att_mask, chunked_limited_mask.unsqueeze(0)
                    )
        else:
            att_mask = None

        # pad_mask is the masking to be used to ignore paddings
        pad_mask = torch.arange(0, max_audio_length, device=device).expand(
            padding_length.size(0), -1
        ) < padding_length.unsqueeze(-1)

        if offset is not None:
            pad_mask_off = torch.arange(0, max_audio_length, device=device).expand(
                padding_length.size(0), -1
            ) >= offset.unsqueeze(-1)
            pad_mask = pad_mask_off.logical_and(pad_mask)

        if att_mask is not None:
            # pad_mask_for_att_mask is the mask which helps to ignore paddings
            pad_mask_for_att_mask = pad_mask.unsqueeze(1).repeat(
                [1, max_audio_length, 1]
            )
            pad_mask_for_att_mask = torch.logical_and(
                pad_mask_for_att_mask, pad_mask_for_att_mask.transpose(1, 2)
            )
            # att_mask is the masking to be used by MHA
            # layers to ignore tokens not supposed to be
            # visible
            att_mask = att_mask[:, :max_audio_length, :max_audio_length]
            # paddings should also get ignored, so
            # pad_mask_for_att_mask is used to ignore their
            # corresponding scores
            att_mask = torch.logical_and(
                pad_mask_for_att_mask, att_mask.to(pad_mask_for_att_mask.device)
            )
            att_mask = ~att_mask

        pad_mask = ~pad_mask
        return pad_mask, att_mask

    def _calc_context_sizes(
        self,
        att_context_size: list[int] | list[list[int]] | None,
        att_context_probs: list[float] | None,
        att_context_style: str,
        conv_context_size: list[int] | str | None,
        conv_kernel_size: int,
    ) -> tuple[list[list[int]], list[int], list[float], list[int]]:
        # convert att_context_size to a standard list of lists
        if att_context_size:
            att_context_size_all = list(att_context_size)
            if isinstance(att_context_size_all[0], int):
                att_context_size_all = [att_context_size_all]
            for i, att_cs in enumerate(att_context_size_all):
                if att_context_style == "chunked_limited":
                    if att_cs[0] > 0 and att_cs[0] % (att_cs[1] + 1) > 0:
                        raise ValueError(
                            f"att_context_size[{i}][0] % "
                            f"(att_context_size[{i}][1]"
                            f" + 1) should be zero!"
                        )
                    if att_cs[1] < 0 and len(att_context_size_all) <= 1:
                        raise ValueError(
                            f"Right context "
                            f"(att_context_size[{i}][1])"
                            f" can not be unlimited for"
                            f" chunked_limited style!"
                        )
        else:
            att_context_size_all = [[-1, -1]]

        if att_context_probs:
            if len(att_context_probs) != len(att_context_size_all):
                raise ValueError(
                    "The size of the att_context_probs "
                    "should be the same as att_context_size."
                )
            att_context_probs = list(att_context_probs)
            if sum(att_context_probs) != 1:
                raise ValueError(
                    "The sum of numbers in "
                    "att_context_probs should be equal "
                    "to one to be a distribution."
                )
        else:
            att_context_probs = [1.0 / len(att_context_size_all)] * len(
                att_context_size_all
            )

        if conv_context_size is not None:
            if not isinstance(conv_context_size, list) and not isinstance(
                conv_context_size, str
            ):
                raise ValueError(
                    "Invalid conv_context_size! It should "
                    "be the string 'causal' or a list of "
                    "two integers."
                )
            if conv_context_size == "causal":
                conv_context_size = [conv_kernel_size - 1, 0]
            else:
                total = conv_context_size[0] + conv_context_size[1] + 1
                if total != conv_kernel_size:
                    raise ValueError(
                        f"Invalid conv_context_size: {self.conv_context_size}!"
                    )
        else:
            conv_context_size = [
                (conv_kernel_size - 1) // 2,
                (conv_kernel_size - 1) // 2,
            ]
        return (
            att_context_size_all,
            att_context_size_all[0],
            att_context_probs,
            conv_context_size,
        )

set_max_audio_length

set_max_audio_length(max_audio_length: int) -> None

Sets maximum input length. Pre-calculates internal seq_range mask.

Parameters:

Name Type Description Default
max_audio_length int

New maximum sequence length.

required
Source code in vllm/model_executor/models/cohere_asr.py
def set_max_audio_length(self, max_audio_length: int) -> None:
    """Set a new maximum input length and pre-compute the positional
    encoding table accordingly.

    Args:
        max_audio_length (int): New maximum sequence length.
    """
    # Borrow device/dtype from any model parameter so the positional
    # table is allocated consistently with the rest of the module.
    ref_param = next(self.parameters())
    self.pos_enc.extend_pe(max_audio_length, ref_param.device, ref_param.dtype)

ConformerFeedForward

Bases: Module

feed-forward module of Conformer model. use_bias (bool): Apply bias to all Linear and Conv1d layers to improve activation flow and stabilize training of huge models.

Source code in vllm/model_executor/models/cohere_asr.py
class ConformerFeedForward(nn.Module):
    """
    feed-forward module of Conformer model.
    use_bias (bool): Apply bias to all Linear and Conv1d
        layers to improve activation flow and stabilize
        training of huge models.
    """

    def __init__(
        self,
        d_model: int,
        d_ff: int,
        activation: nn.Module | None = None,
        use_bias: bool = True,
    ) -> None:
        super().__init__()
        if activation is None:
            activation = Swish()
        self.linear1 = nn.Linear(d_model, d_ff, bias=use_bias)
        self.activation = activation
        self.linear2 = nn.Linear(d_ff, d_model, bias=use_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        return x

ConformerLayer

Bases: Module

A single block of the Conformer encoder.

Parameters:

Name Type Description Default
d_model int

input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward

required
d_ff int

hidden dimension of PositionwiseFeedForward

required
self_attention_model str

type of the attention layer and positional encoding

'rel_pos'
n_heads int

number of heads for multi-head attention

4
conv_kernel_size int

kernel size for depthwise convolution in convolution module

31
use_bias bool

Apply bias to all Linear and Conv1d layers from each ConformerLayer to improve activation flow and stabilize training of huge models. Defaults to True.

True
Source code in vllm/model_executor/models/cohere_asr.py
class ConformerLayer(torch.nn.Module):
    """A single block of the Conformer encoder.

    The residual stream passes through: half-step feed-forward,
    self-attention, convolution, half-step feed-forward, and a final
    LayerNorm — each sub-module in pre-norm style.

    Args:
        d_model (int): input dimension of the attention and
            feed-forward sub-modules.
        d_ff (int): hidden dimension of the feed-forward sub-modules.
        self_attention_model (str): type of the attention layer and
            positional encoding; only "rel_pos" is supported.
        n_heads (int): number of heads for multi-head attention.
        conv_kernel_size (int): kernel size of the depthwise conv in
            the convolution module.
        use_bias (bool): apply bias to all Linear and Conv1d layers of
            this layer. Defaults to True.
    """

    def __init__(
        self,
        d_model: int,
        d_ff: int,
        self_attention_model: str = "rel_pos",
        n_heads: int = 4,
        conv_kernel_size: int = 31,
        conv_norm_type: str = "batch_norm",
        conv_context_size: int | None = None,
        pos_bias_u: nn.Parameter | torch.Tensor | None = None,
        pos_bias_v: nn.Parameter | torch.Tensor | None = None,
        att_context_size: list[int] | None = None,
        use_bias: bool = True,
    ) -> None:
        super().__init__()
        if att_context_size is None:
            # NOTE(review): normalized here but not stored on self —
            # confirm whether anything relies on this default.
            att_context_size = [-1, -1]

        self.self_attention_model = self_attention_model
        # Each feed-forward output is halved before the residual add.
        self.fc_factor = 0.5

        # First feed-forward module (pre-norm).
        self.norm_feed_forward1 = nn.LayerNorm(d_model)
        self.feed_forward1 = ConformerFeedForward(
            d_model=d_model, d_ff=d_ff, use_bias=use_bias
        )

        # Convolution module (pre-norm).
        self.norm_conv = nn.LayerNorm(d_model)
        self.conv = ConformerConvolution(
            d_model=d_model,
            kernel_size=conv_kernel_size,
            norm_type=conv_norm_type,
            conv_context_size=conv_context_size,
            use_bias=use_bias,
        )

        # Multi-headed self-attention module (pre-norm).
        self.norm_self_att = nn.LayerNorm(d_model)

        assert self_attention_model == "rel_pos"

        self.self_attn = RelPositionMultiHeadAttention(
            n_head=n_heads,
            n_feat=d_model,
            pos_bias_u=pos_bias_u,
            pos_bias_v=pos_bias_v,
            use_bias=use_bias,
        )

        # Second feed-forward module (pre-norm).
        self.norm_feed_forward2 = nn.LayerNorm(d_model)
        self.feed_forward2 = ConformerFeedForward(
            d_model=d_model, d_ff=d_ff, use_bias=use_bias
        )

        self.norm_out = nn.LayerNorm(d_model)

    def forward(
        self,
        x: torch.Tensor,
        att_mask: torch.Tensor | None = None,
        pos_emb: torch.Tensor | None = None,
        pad_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): input signals (B, T, d_model)
            att_mask (torch.Tensor): attention masks(B, T, T)
            pos_emb (torch.Tensor): (L, 1, d_model)
            pad_mask (torch.tensor): padding mask
        Returns:
            x (torch.Tensor): (B, T, d_model)
        """
        # Half-step feed-forward into the residual stream.
        out = x + self.fc_factor * self.feed_forward1(self.norm_feed_forward1(x))

        # Self-attention; the variant is fixed at construction time
        # (only "rel_pos" is supported by __init__'s assertion).
        attn_in = self.norm_self_att(out)
        if self.self_attention_model == "rel_pos":
            attn_out = self.self_attn(
                query=attn_in,
                key=attn_in,
                value=attn_in,
                mask=att_mask,
                pos_emb=pos_emb,
            )
        elif self.self_attention_model == "rel_pos_local_attn":
            attn_out = self.self_attn(
                query=attn_in,
                key=attn_in,
                value=attn_in,
                pad_mask=pad_mask,
                pos_emb=pos_emb,
            )
        elif self.self_attention_model == "abs_pos":
            attn_out = self.self_attn(
                query=attn_in, key=attn_in, value=attn_in, mask=att_mask
            )
        else:
            # Unreachable given the constructor assertion; the add
            # below would raise if this branch were ever taken.
            attn_out = None

        out = out + attn_out

        # Convolution module with padding mask.
        out = out + self.conv(self.norm_conv(out), pad_mask=pad_mask)

        # Second half-step feed-forward.
        out = out + self.fc_factor * self.feed_forward2(self.norm_feed_forward2(out))

        # Final normalization of the residual stream.
        return self.norm_out(out)

forward

forward(
    x: Tensor,
    att_mask: Tensor | None = None,
    pos_emb: Tensor | None = None,
    pad_mask: Tensor | None = None,
) -> Tensor

Parameters:

Name Type Description Default
x Tensor

input signals (B, T, d_model)

required
att_mask Tensor

attention masks(B, T, T)

None
pos_emb Tensor

(L, 1, d_model)

None
pad_mask tensor

padding mask

None

Returns: x (torch.Tensor): (B, T, d_model)

Source code in vllm/model_executor/models/cohere_asr.py
def forward(
    self,
    x: torch.Tensor,
    att_mask: torch.Tensor | None = None,
    pos_emb: torch.Tensor | None = None,
    pad_mask: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Apply one Conformer block: half-step FFN, self-attention,
    convolution, half-step FFN, then a final LayerNorm — each
    sub-module pre-normed and added to the residual stream.

    Args:
        x (torch.Tensor): input signals (B, T, d_model)
        att_mask (torch.Tensor): attention masks(B, T, T)
        pos_emb (torch.Tensor): (L, 1, d_model)
        pad_mask (torch.tensor): padding mask
    Returns:
        x (torch.Tensor): (B, T, d_model)
    """
    # First feed-forward module; output scaled by fc_factor (0.5)
    # before the residual add.
    residual = x
    x = self.norm_feed_forward1(x)
    x = self.feed_forward1(x)
    residual = residual + x * self.fc_factor

    # Self-attention (pre-norm). The variant is fixed by
    # self.self_attention_model at construction time.
    x = self.norm_self_att(residual)
    if self.self_attention_model == "rel_pos":
        x = self.self_attn(
            query=x,
            key=x,
            value=x,
            mask=att_mask,
            pos_emb=pos_emb,
        )
    elif self.self_attention_model == "rel_pos_local_attn":
        x = self.self_attn(
            query=x,
            key=x,
            value=x,
            pad_mask=pad_mask,
            pos_emb=pos_emb,
        )
    elif self.self_attention_model == "abs_pos":
        x = self.self_attn(query=x, key=x, value=x, mask=att_mask)
    else:
        # NOTE(review): unreachable while __init__ asserts "rel_pos";
        # if ever hit, the addition below would raise on None.
        x = None

    residual = residual + x

    # Convolution module (pre-norm), masked on padded positions.
    x = self.norm_conv(residual)
    x = self.conv(x, pad_mask=pad_mask)
    residual = residual + x

    # Second feed-forward module, again scaled by fc_factor.
    x = self.norm_feed_forward2(residual)
    x = self.feed_forward2(x)
    residual = residual + x * self.fc_factor

    # Final normalization of the residual stream.
    x = self.norm_out(residual)

    return x

ConvSubsampling

Bases: Module

Source code in vllm/model_executor/models/cohere_asr.py
class ConvSubsampling(nn.Module):
    """Convolutional subsampling front-end ("dw_striding" style).

    A stack of stride-2 Conv2d layers (one regular conv followed by
    depthwise + pointwise pairs) reduces both the time and feature
    axes by ``subsampling_factor``; a final Linear projects the
    flattened channels/features to ``feat_out``.

    Args:
        subsampling: subsampling style; only "dw_striding" is supported.
        subsampling_factor: total time reduction; expected to be a
            power of 2 (each stage halves the resolution).
        feat_in: number of input features (e.g. mel bins).
        feat_out: output feature size of the final projection.
        conv_channels: number of channels inside the conv stack.
        subsampling_conv_chunking_factor: must be -1, 1, or a power
            of 2 (only evenness is validated here).
        activation: non-linearity between convs; defaults to ReLU.
        is_causal: causal padding is not supported by this module.
    """

    def __init__(
        self,
        subsampling: str,
        subsampling_factor: int,
        feat_in: int,
        feat_out: int,
        conv_channels: int,
        subsampling_conv_chunking_factor: int = 1,
        activation: nn.Module | None = None,
        is_causal: bool = False,
    ) -> None:
        super().__init__()
        if activation is None:
            activation = nn.ReLU()

        if subsampling_factor % 2 != 0:
            raise ValueError("Sampling factor should be a multiple of 2!")
        # Number of stride-2 stages. NOTE(review): the evenness check
        # above does not fully enforce the power-of-2 assumption here.
        self._sampling_num = int(math.log2(subsampling_factor))

        if (
            subsampling_conv_chunking_factor != -1
            and subsampling_conv_chunking_factor != 1
            and subsampling_conv_chunking_factor % 2 != 0
        ):
            # NOTE(review): only odd values are rejected; true powers
            # of 2 (as the message claims) are not enforced.
            raise ValueError(
                "subsampling_conv_chunking_factor should be -1, 1, or a power of 2"
            )

        in_channels = 1
        layers = []

        assert subsampling == "dw_striding"
        self._stride = 2
        self._kernel_size = 3
        self._ceil_mode = False

        assert not is_causal

        # Symmetric "same"-style padding for the 3x3 kernels.
        self._left_padding = (self._kernel_size - 1) // 2
        self._right_padding = (self._kernel_size - 1) // 2

        # Layer 1
        # [1, T, num_melspec] -> [conv_channels, T//2, num_melspec//2]
        layers.append(
            torch.nn.Conv2d(
                in_channels=in_channels,
                out_channels=conv_channels,
                kernel_size=self._kernel_size,
                stride=self._stride,
                padding=self._left_padding,
            )
        )
        in_channels = conv_channels
        layers.append(activation)

        for _ in range(self._sampling_num - 1):
            # [conv_channels, T//2^i, num_melspec//2^i] ->
            # [conv_channels, T//2^(i+1), num_melspec//2^(i+1)]
            # depthwise conv
            layers.append(
                torch.nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    kernel_size=self._kernel_size,
                    stride=self._stride,
                    padding=self._left_padding,
                    groups=in_channels,
                )
            )

            # [conv_channels, T//2^(i+1), num_melspec//2^(i+1)]
            # -> [conv_channels, T//2^(i+1), num_melspec//2^(i+1)]
            # pointwise conv: mixes channels at unchanged resolution
            layers.append(
                torch.nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=conv_channels,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    groups=1,
                )
            )
            layers.append(activation)
            in_channels = conv_channels

        # Size the projection from how many feature bins survive the
        # conv stack.
        in_length = torch.tensor(feat_in, dtype=torch.float)
        out_length = self.calc_length(
            lengths=in_length,
            all_paddings=self._left_padding + self._right_padding,
            kernel_size=self._kernel_size,
            stride=self._stride,
            ceil_mode=self._ceil_mode,
            repeat_num=self._sampling_num,
        )

        # reshape:
        # [conv_channels, T//sub_factor, num_melspec//sub_factor]
        # -> [T//sub_factor, conv_channels * (num_melspec//sub_factor)]
        # mlp:
        # [T//sub_factor, conv_channels * (num_melspec//sub_factor)]
        # -> [T//sub_factor, feat_out]
        self.out = torch.nn.Linear(conv_channels * int(out_length), feat_out)
        self.conv2d_subsampling = True
        self.conv = MaskedConvSequential(*layers)

    def calc_length(
        self,
        lengths: torch.Tensor,
        all_paddings: int,
        kernel_size: int,
        stride: int,
        ceil_mode: bool,
        repeat_num: int = 1,
    ) -> torch.Tensor:
        """Calculate the output length of a Tensor passed through
        ``repeat_num`` identical convolution / max pooling layers."""
        add_pad: float = all_paddings - kernel_size
        one: float = 1.0
        for _ in range(repeat_num):
            lengths = torch.div(lengths.to(dtype=torch.float) + add_pad, stride) + one
            lengths = torch.ceil(lengths) if ceil_mode else torch.floor(lengths)
        return lengths.to(dtype=torch.int)

    def forward(
        self, x: torch.Tensor, lengths: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Subsample ``x`` and return it with the updated lengths."""
        x, lengths = self.conv(x, lengths)

        if self.conv2d_subsampling:
            b, c, t, f = x.size()
            # Flatten channels x features into one vector per frame.
            x = self.out(x.transpose(1, 2).reshape(b, t, -1))
        # Transpose to Channel Last mode
        else:
            x = x.transpose(1, 2)

        return x, lengths

calc_length

calc_length(
    lengths: Tensor,
    all_paddings: int,
    kernel_size: int,
    stride: int,
    ceil_mode: bool,
    repeat_num: int = 1,
) -> Tensor

Calculates the output length of a Tensor passed through a convolution or max pooling layer

Source code in vllm/model_executor/models/cohere_asr.py
def calc_length(
    self,
    lengths: torch.Tensor,
    all_paddings: int,
    kernel_size: int,
    stride: int,
    ceil_mode: bool,
    repeat_num: int = 1,
) -> torch.Tensor:
    """Calculate the output length(s) of a tensor passed through
    ``repeat_num`` identical convolution / max pooling layers.

    Args:
        lengths: per-sample input lengths.
        all_paddings: left + right padding of each layer.
        kernel_size: kernel size of each layer.
        stride: stride of each layer.
        ceil_mode: round up (True) or down (False), as in pooling.
        repeat_num: number of identical layers applied in sequence.

    Returns:
        Integer tensor of output lengths.
    """
    add_pad: float = all_paddings - kernel_size
    one: float = 1.0
    # Apply the standard conv length formula once per layer; the loop
    # index itself is not needed.
    for _ in range(repeat_num):
        lengths = torch.div(lengths.to(dtype=torch.float) + add_pad, stride) + one
        lengths = torch.ceil(lengths) if ceil_mode else torch.floor(lengths)
    return lengths.to(dtype=torch.int)

FixedPositionalEncoding

Bases: Module

Fixed positional encoding (embedding layer) from sine and cosine functions of different frequencies according to https://arxiv.org/abs/1706.03762

Parameters:

Name Type Description Default
hidden_size int

size of the embeddings in the model, also known as d_model

required
max_sequence_length int

maximum allowed length of the input sequence

512
Source code in vllm/model_executor/models/cohere_asr.py
class FixedPositionalEncoding(nn.Module):
    """Sinusoidal (non-learned) positional embedding table.

    Sine/cosine encodings of different frequencies as in
    https://arxiv.org/abs/1706.03762, scaled by 1/sqrt(hidden_size).

    Args:
        hidden_size: embedding width (d_model).
        max_sequence_length: largest supported position id.
    """

    def __init__(self, hidden_size: int, max_sequence_length: int = 512) -> None:
        super().__init__()
        self._hidden_size = hidden_size
        self._max_sequence_length = max_sequence_length
        self._build_pos_enc(
            hidden_size=hidden_size,
            max_sequence_length=max_sequence_length,
        )

    def _build_pos_enc(self, hidden_size: int, max_sequence_length: int) -> None:
        """(Re)build the pre-computed positional-encoding buffer."""
        positions = torch.arange(0.0, max_sequence_length).unsqueeze(1)
        inv_freq = torch.exp(
            torch.arange(0.0, hidden_size, 2) * (-math.log(10000.0) / hidden_size)
        )
        table = torch.zeros(max_sequence_length, hidden_size)
        table[:, 0::2] = torch.sin(positions * inv_freq)
        table[:, 1::2] = torch.cos(positions * inv_freq)
        # Scale the whole table by 1/sqrt(hidden_size).
        table.div_(math.sqrt(hidden_size))
        self.register_buffer("pos_enc", table)

    def forward(self, position_ids: torch.Tensor) -> torch.Tensor:
        # Plain table lookup of the fixed buffer.
        return torch.embedding(self.pos_enc, position_ids)

_build_pos_enc

_build_pos_enc(
    hidden_size: int, max_sequence_length: int
) -> None

Builds/replaces pre-computed positional encoding.

Source code in vllm/model_executor/models/cohere_asr.py
def _build_pos_enc(self, hidden_size: int, max_sequence_length: int) -> None:
    """Builds/replaces pre-computed positional encoding."""
    # Table of shape (max_sequence_length, hidden_size).
    pos_enc = torch.zeros(max_sequence_length, hidden_size)
    # Position ids as a column vector: (max_sequence_length, 1).
    position = torch.arange(0.0, max_sequence_length).unsqueeze(1)
    # One frequency per even dimension: 10000^(-2i/hidden_size).
    coef = -math.log(10000.0) / hidden_size
    div_term = torch.exp(coef * torch.arange(0.0, hidden_size, 2))
    # Even dims get sine, odd dims get cosine.
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)
    # Scale the whole table by 1/sqrt(hidden_size), in place.
    pos_enc.div_(math.sqrt(hidden_size))
    # Registered as a buffer so it follows the module's device/dtype.
    self.register_buffer("pos_enc", pos_enc)

MaskedConvSequential

Bases: Sequential

Source code in vllm/model_executor/models/cohere_asr.py
class MaskedConvSequential(nn.Sequential):
    """``nn.Sequential`` over (B, 1, T, F) feature maps that keeps
    per-sample length masks in sync through strided layers.

    The (B, T, F) input is unsqueezed to a single-channel image;
    before every layer (and once at the end) padded time steps are
    zeroed in place, and lengths are recomputed whenever a layer
    strides.
    """

    def forward(
        self, x: torch.Tensor, lengths: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run the stack; return the output and updated lengths."""
        x = x.unsqueeze(1)  # (batch, 1, time, features)
        current_lengths = lengths.clone().float()
        mask = self._create_mask(x, current_lengths.long())

        # Process through each layer with mask propagation
        for layer in self:
            # Zero padded positions before the layer sees them.
            # NOTE(review): mul_ is in place, so the caller's input
            # tensor is also masked on the first iteration — confirm
            # this side effect is intended.
            x = self.apply_channel_mask(x, mask)

            # Apply layer
            x = layer(x)

            # Update lengths for stride operations with proper padding
            if hasattr(layer, "stride") and layer.stride != (1, 1):
                if hasattr(layer, "_left_padding"):
                    padding = (
                        layer._left_padding,
                        layer._right_padding,
                    )  # CausalConv2D
                else:
                    # NOTE(review): Conv2d.padding is (pad_h, pad_w);
                    # treating it as (left, right) is only exact for
                    # symmetric time padding.
                    padding = layer.padding
                current_lengths = self.calculate_conv_output_size(
                    current_lengths, layer.kernel_size[0], layer.stride[0], padding
                )
                mask = self._create_mask(x, current_lengths.long())

        # Final masking
        x = self.apply_channel_mask(x, mask)
        return x, current_lengths.long()

    def _create_mask(self, tensor: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        """Create broadcastable mask from per-sample lengths.

        Returns a (B, 1, T, 1) mask that broadcasts over channels and
        features without materializing a full (B, C, T, F) tensor.
        """
        batch_size, channels, time, features = tensor.shape
        time_mask = torch.arange(time, device=tensor.device).expand(
            batch_size, time
        ) < lengths.unsqueeze(1)
        return time_mask.to(tensor.dtype).unsqueeze(1).unsqueeze(-1)

    def apply_channel_mask(
        self, tensor: torch.Tensor, mask: torch.Tensor
    ) -> torch.Tensor:
        """Apply mask in-place via broadcasting.

        tensor: (B, C, T, F),  mask: (B, 1, T, 1)
        """
        tensor.mul_(mask)
        return tensor

    def calculate_conv_output_size(
        self,
        input_size: torch.Tensor,
        kernel_size: int,
        stride: int,
        padding: tuple[int, int],
    ):
        """Calculate exact output size after convolution:
        floor((L + pad_l + pad_r - k) / stride) + 1."""
        return (input_size + padding[0] + padding[1] - kernel_size) // stride + 1

_create_mask

_create_mask(tensor: Tensor, lengths: Tensor) -> Tensor

Create broadcastable mask from per-sample lengths.

Returns a (B, 1, T, 1) mask that broadcasts over channels and features without materializing a full (B, C, T, F) tensor.

Source code in vllm/model_executor/models/cohere_asr.py
def _create_mask(self, tensor: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """Create broadcastable mask from per-sample lengths.

    Returns a (B, 1, T, 1) mask that broadcasts over channels and
    features without materializing a full (B, C, T, F) tensor.
    """
    batch_size, channels, time, features = tensor.shape
    time_mask = torch.arange(time, device=tensor.device).expand(
        batch_size, time
    ) < lengths.unsqueeze(1)
    return time_mask.to(tensor.dtype).unsqueeze(1).unsqueeze(-1)

apply_channel_mask

apply_channel_mask(tensor: Tensor, mask: Tensor) -> Tensor

Apply mask in-place via broadcasting.

tensor: (B, C, T, F), mask: (B, 1, T, 1)

Source code in vllm/model_executor/models/cohere_asr.py
def apply_channel_mask(
    self, tensor: torch.Tensor, mask: torch.Tensor
) -> torch.Tensor:
    """Zero out masked positions, mutating ``tensor`` in place.

    Shapes: tensor (B, C, T, F), mask (B, 1, T, 1); the mask
    broadcasts across channels and features. Returns the same
    (mutated) tensor object.
    """
    return tensor.mul_(mask)

calculate_conv_output_size

calculate_conv_output_size(
    input_size: Tensor,
    kernel_size: int,
    stride: int,
    padding: tuple[int, int],
)

Calculate exact output size after convolution.

Source code in vllm/model_executor/models/cohere_asr.py
def calculate_conv_output_size(
    self,
    input_size: torch.Tensor,
    kernel_size: int,
    stride: int,
    padding: tuple[int, int],
):
    """Exact post-convolution size: floor((L + pad_l + pad_r - k) / s) + 1."""
    padded = input_size + padding[0] + padding[1]
    return (padded - kernel_size) // stride + 1

PositionalEncoding

Bases: Module

Fixed sinusoidal positional encoding. Args: d_model (int): embedding dim max_len (int): maximum input length xscale (float | None): optional factor (e.g. sqrt(d_model)) by which the input is multiplied before the encoding is added

Source code in vllm/model_executor/models/cohere_asr.py
class PositionalEncoding(torch.nn.Module):
    """Fixed sinusoidal positional encoding.
    Args:
        d_model (int): embedding dim
        max_len (int): maximum input length
        xscale (bool): whether to scale the input by sqrt(d_model)
    """

    def __init__(
        self, d_model: int, max_len: int = 5000, xscale: float | None = None
    ) -> None:
        super().__init__()
        self.d_model = d_model
        self.xscale = xscale
        self.max_len = max_len

    def create_pe(self, positions: torch.Tensor, dtype: torch.dtype) -> None:
        pos_length = positions.size(0)
        pe = torch.zeros(pos_length, self.d_model, device=positions.device)
        div_term = torch.exp(
            torch.arange(
                0, self.d_model, 2, dtype=torch.float32, device=positions.device
            )
            * -(math.log(10000.0) / self.d_model)
        )
        pe[:, 0::2] = torch.sin(positions * div_term)
        pe[:, 1::2] = torch.cos(positions * div_term)
        pe = pe.unsqueeze(0).to(dtype)
        if hasattr(self, "pe"):
            self.pe = pe
        else:
            self.register_buffer("pe", pe, persistent=False)

    def forward(
        self, x: torch.Tensor, cache_len: int = 0
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Adds positional encoding.
        Args:
            x (torch.Tensor): Input. Its shape is (batch, time, feature_size)
            cache_len (int): the size of the cache which is used to shift positions
        Returns:
            x+pos_emb (torch.Tensor): Its shape is (batch, time, feature_size)
            pos_emb (torch.Tensor): Its shape is (1, time, feature_size)
        """
        input_len = x.size(1) + cache_len
        if self.xscale:
            x = x * self.xscale
        pos_emb = self.pe[:, :input_len]
        x = x + pos_emb
        return x, pos_emb

forward

forward(
    x: Tensor, cache_len: int = 0
) -> tuple[Tensor, Tensor]

Adds positional encoding. Args: x (torch.Tensor): Input. Its shape is (batch, time, feature_size) cache_len (int): the size of the cache which is used to shift positions Returns: x+pos_emb (torch.Tensor): Its shape is (batch, time, feature_size) pos_emb (torch.Tensor): Its shape is (1, time, feature_size)

Source code in vllm/model_executor/models/cohere_asr.py
def forward(
    self, x: torch.Tensor, cache_len: int = 0
) -> tuple[torch.Tensor, torch.Tensor]:
    """Add the pre-computed positional encoding to ``x``.

    Args:
        x (torch.Tensor): input of shape (batch, time, feature_size)
        cache_len (int): size of the cache used to shift positions
    Returns:
        (x + pos_emb, pos_emb) with pos_emb of shape
        (1, time, feature_size)
    """
    total_len = x.size(1) + cache_len
    # Optional input scaling (skipped when xscale is falsy/None).
    if self.xscale:
        x = x * self.xscale
    pe_slice = self.pe[:, :total_len]
    return x + pe_slice, pe_slice

RelPositionMultiHeadAttention

Bases: CohereASRMultiHeadAttention

Multi-Head Attention layer of Transformer-XL with support of relative positional encoding. Paper: https://arxiv.org/abs/1901.02860 Args: n_head (int): number of heads n_feat (int): size of the features use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention

Source code in vllm/model_executor/models/cohere_asr.py
class RelPositionMultiHeadAttention(CohereASRMultiHeadAttention):
    """Transformer-XL style multi-head attention with relative
    positional encoding (https://arxiv.org/abs/1901.02860).

    Args:
        n_head (int): number of attention heads
        n_feat (int): size of the features
        use_bias (bool): whether to apply bias in the linear and conv
            layers of MultiHeadAttention
    """

    def __init__(
        self,
        n_head: int,
        n_feat: int,
        pos_bias_u: nn.Parameter | torch.Tensor | None,
        pos_bias_v: nn.Parameter | torch.Tensor | None,
        use_bias: bool = True,
    ) -> None:
        """Construct an RelPositionMultiHeadedAttention object."""
        super().__init__(n_head=n_head, n_feat=n_feat, use_bias=use_bias)
        # Projection applied to the positional embeddings.
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # Learnable biases for the content (matrix c) and position
        # (matrix d) terms, Section 3.3 of the Transformer-XL paper.
        if pos_bias_u is not None and pos_bias_v is not None:
            self.pos_bias_u = pos_bias_u
            self.pos_bias_v = pos_bias_v
        else:
            # Frozen zero biases when either one is missing.
            self.pos_bias_u = nn.Parameter(
                torch.zeros(self.h, self.d_k), requires_grad=False
            )
            self.pos_bias_v = nn.Parameter(
                torch.zeros(self.h, self.d_k), requires_grad=False
            )

    def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
        """Shift scores from absolute to relative indexing.

        Args:
            x (torch.Tensor): (batch, nheads, time, 2*time-1)
        """
        n_batch, n_head, t1, t2 = x.size()
        # Prepend one zero column, fold the last two axes, drop the
        # first row — the standard pad/reshape relative-shift trick.
        shifted = torch.nn.functional.pad(x, pad=(1, 0))  # (b, h, t1, t2+1)
        shifted = shifted.view(n_batch, n_head, -1, t1)  # (b, h, t2+1, t1)
        return shifted[:, :, 1:].view(n_batch, n_head, t1, t2)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor | None,
        pos_emb: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Args:
            query (torch.Tensor): (batch, time1, size)
            key (torch.Tensor): (batch, time2, size)
            value(torch.Tensor): (batch, time2, size)
            mask (torch.Tensor): (batch, time1, time2)
            pos_emb (torch.Tensor) : (batch, time1, size)

        Returns:
            output (torch.Tensor): transformed `value`
                (batch, time1, d_model) weighted by the
                query dot key attention
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        # Per-head projection of the positional embeddings.
        p = self.linear_pos(pos_emb)
        p = p.view(pos_emb.size(0), -1, self.h, self.d_k).transpose(1, 2)

        # Add the learned biases; back to (batch, head, time1, d_k).
        content_q = (q + self.pos_bias_u).transpose(1, 2)
        position_q = (q + self.pos_bias_v).transpose(1, 2)

        # Position term (matrices b/d), shifted to relative indexing.
        matrix_bd = self.rel_shift(torch.matmul(position_q, p.transpose(-2, -1)))

        # Content term (matrices a/c); trim the position term to its width.
        matrix_ac = torch.matmul(content_q, k.transpose(-2, -1))
        matrix_bd = matrix_bd[:, :, :, : matrix_ac.size(-1)]

        scores = (matrix_ac + matrix_bd) / self.s_d_k  # (batch, head, time1, time2)
        return self.forward_attention(v, scores, mask)

__init__

__init__(
    n_head: int,
    n_feat: int,
    pos_bias_u: Parameter | Tensor | None,
    pos_bias_v: Parameter | Tensor | None,
    use_bias: bool = True,
) -> None

Construct an RelPositionMultiHeadedAttention object.

Source code in vllm/model_executor/models/cohere_asr.py
def __init__(
    self,
    n_head: int,
    n_feat: int,
    pos_bias_u: nn.Parameter | torch.Tensor | None,
    pos_bias_v: nn.Parameter | torch.Tensor | None,
    use_bias: bool = True,
) -> None:
    """Construct an RelPositionMultiHeadedAttention object.

    Args:
        n_head: number of attention heads.
        n_feat: feature size.
        pos_bias_u: bias for the content term; zeros are substituted
            when either bias is None.
        pos_bias_v: bias for the position term; zeros are substituted
            when either bias is None.
        use_bias: whether linear/conv layers of the base attention
            use bias terms.
    """
    super().__init__(
        n_head=n_head,
        n_feat=n_feat,
        use_bias=use_bias,
    )
    # linear transformation for positional encoding
    self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
    # these two learnable biases are used in matrix c and matrix d
    # as described in https://arxiv.org/abs/1901.02860 Section 3.3
    if pos_bias_u is None or pos_bias_v is None:
        # Frozen zero biases (requires_grad=False) when none supplied.
        self.pos_bias_u = nn.Parameter(
            torch.zeros(self.h, self.d_k), requires_grad=False
        )
        self.pos_bias_v = nn.Parameter(
            torch.zeros(self.h, self.d_k), requires_grad=False
        )
    else:
        self.pos_bias_u = pos_bias_u
        self.pos_bias_v = pos_bias_v

forward

forward(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    mask: Tensor | None,
    pos_emb: Tensor | None = None,
) -> Tensor

Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (torch.Tensor): (batch, time1, size) key (torch.Tensor): (batch, time2, size) value(torch.Tensor): (batch, time2, size) mask (torch.Tensor): (batch, time1, time2) pos_emb (torch.Tensor) : (batch, time1, size)

Returns:

Name Type Description
output Tensor

transformed value (batch, time1, d_model) weighted by the query dot key attention

Source code in vllm/model_executor/models/cohere_asr.py
def forward(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.Tensor | None,
    pos_emb: torch.Tensor | None = None,
) -> torch.Tensor:
    """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
    Args:
        query (torch.Tensor): (batch, time1, size)
        key (torch.Tensor): (batch, time2, size)
        value(torch.Tensor): (batch, time2, size)
        mask (torch.Tensor): (batch, time1, time2)
        pos_emb (torch.Tensor) : (batch, time1, size)

    Returns:
        output (torch.Tensor): transformed `value`
            (batch, time1, d_model) weighted by the
            query dot key attention
    """
    # Project inputs into per-head q/k/v (via the base class); q is
    # moved to (batch, time1, head, d_k) so the biases broadcast.
    q, k, v = self.forward_qkv(query, key, value)
    q = q.transpose(1, 2)  # (batch, time1, head, d_k)

    # Project the positional embeddings and split them per head.
    n_batch_pos = pos_emb.size(0)
    p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
    p = p.transpose(1, 2)  # (batch, head, time1, d_k)

    # (batch, head, time1, d_k)
    q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
    # (batch, head, time1, d_k)
    q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

    # compute attention score
    # first compute matrix a and matrix c
    # as described in https://arxiv.org/abs/1901.02860 Section 3.3
    # (batch, head, time1, time2)

    # compute matrix b and matrix d
    # (batch, head, time1, time2)
    matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
    matrix_bd = self.rel_shift(matrix_bd)

    # drops extra elements in the matrix_bd to match the matrix_ac's size
    matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
    matrix_bd = matrix_bd[:, :, :, : matrix_ac.size(-1)]
    # Combine content + position scores, divided by self.s_d_k
    # (presumably sqrt(d_k) — defined in the base class).
    scores = (matrix_ac + matrix_bd) / self.s_d_k  # (batch, head, time1, time2)
    return self.forward_attention(v, scores, mask)

rel_shift

rel_shift(x: Tensor) -> Tensor

Compute relative positional encoding. Args: x (torch.Tensor): (batch, nheads, time, 2*time-1)

Source code in vllm/model_executor/models/cohere_asr.py
def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
    """Compute relative positional encoding.
    Args:
        x (torch.Tensor): (batch, nheads, time, 2*time-1)
    """
    b, h, qlen, pos_len = x.size()  # (b, h, t1, t2)
    # need to add a column of zeros on the left side of
    # last dimension to perform the relative shifting
    x = torch.nn.functional.pad(x, pad=(1, 0))  # (b, h, t1, t2+1)
    x = x.view(b, h, -1, qlen)  # (b, h, t2+1, t1)
    # need to drop the first row
    x = x[:, :, 1:].view(b, h, qlen, pos_len)  # (b, h, t1, t2)
    return x

RelPositionalEncoding

Bases: PositionalEncoding

Relative positional encoding for TransformerXL's layers See : Appendix B in https://arxiv.org/abs/1901.02860 Args: d_model (int): embedding dim max_len (int): maximum input length xscale (bool): whether to scale the input by sqrt(d_model)

Source code in vllm/model_executor/models/cohere_asr.py
class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding for TransformerXL's layers
    See : Appendix B in https://arxiv.org/abs/1901.02860
    Args:
        d_model (int): embedding dim
        max_len (int): maximum input length
        xscale (bool): whether to scale the input by sqrt(d_model)
    """

    def extend_pe(self, length: int, device: torch.device, dtype: torch.dtype) -> None:
        """Reset and extend the positional encodings if needed."""
        needed_size = 2 * length - 1
        if hasattr(self, "pe") and self.pe.size(1) >= needed_size:
            return
        positions = torch.arange(
            length - 1, -length, -1, dtype=torch.float32, device=device
        ).unsqueeze(1)
        self.create_pe(positions=positions, dtype=dtype)

    def forward(
        self, x: torch.Tensor, cache_len: int = 0
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Compute positional encoding.
        Args:
            x (torch.Tensor): Input. Its shape is (batch, time, feature_size)
            cache_len (int): the size of the cache which is used to shift positions
        Returns:
            x (torch.Tensor): Its shape is (batch, time, feature_size)
            pos_emb (torch.Tensor): Its shape is (1, time, feature_size)
        """

        if self.xscale:
            x = x * self.xscale

        input_len = x.size(1) + cache_len
        center_pos = self.pe.size(1) // 2 + 1
        start_pos = center_pos - input_len
        end_pos = center_pos + input_len - 1
        pos_emb = self.pe[:, start_pos:end_pos]

        return x, pos_emb

extend_pe

extend_pe(
    length: int, device: device, dtype: dtype
) -> None

Reset and extend the positional encodings if needed.

Source code in vllm/model_executor/models/cohere_asr.py
def extend_pe(self, length: int, device: torch.device, dtype: torch.dtype) -> None:
    """Reset and extend the positional encodings if needed."""
    needed_size = 2 * length - 1
    if hasattr(self, "pe") and self.pe.size(1) >= needed_size:
        return
    positions = torch.arange(
        length - 1, -length, -1, dtype=torch.float32, device=device
    ).unsqueeze(1)
    self.create_pe(positions=positions, dtype=dtype)

forward

forward(
    x: Tensor, cache_len: int = 0
) -> tuple[Tensor, Tensor]

Compute positional encoding. Args: x (torch.Tensor): Input. Its shape is (batch, time, feature_size) cache_len (int): the size of the cache which is used to shift positions Returns: x (torch.Tensor): Its shape is (batch, time, feature_size) pos_emb (torch.Tensor): Its shape is (1, time, feature_size)

Source code in vllm/model_executor/models/cohere_asr.py
def forward(
    self, x: torch.Tensor, cache_len: int = 0
) -> tuple[torch.Tensor, torch.Tensor]:
    """Compute positional encoding.
    Args:
        x (torch.Tensor): Input. Its shape is (batch, time, feature_size)
        cache_len (int): the size of the cache which is used to shift positions
    Returns:
        x (torch.Tensor): Its shape is (batch, time, feature_size)
        pos_emb (torch.Tensor): Its shape is (1, time, feature_size)
    """

    if self.xscale:
        x = x * self.xscale

    input_len = x.size(1) + cache_len
    center_pos = self.pe.size(1) // 2 + 1
    start_pos = center_pos - input_len
    end_pos = center_pos + input_len - 1
    pos_emb = self.pe[:, start_pos:end_pos]

    return x, pos_emb

Swish

Bases: SiLU

Swish activation function introduced in 'https://arxiv.org/abs/1710.05941' Mathematically identical to SiLU. See note in nn.SiLU for references.

Source code in vllm/model_executor/models/cohere_asr.py
class Swish(nn.SiLU):
    """
    Swish activation function introduced in 'https://arxiv.org/abs/1710.05941'
    Mathematically identical to SiLU. See note in nn.SiLU for references.
    """