},
        .qdma = {
                .qtx_cfg        = 0x1800,
+               .qtx_sch        = 0x1804,
                .rx_ptr         = 0x1900,
                .rx_cnt_cfg     = 0x1904,
                .qcrx_ptr       = 0x1908,
                .rst_idx        = 0x1a08,
                .delay_irq      = 0x1a0c,
                .fc_th          = 0x1a10,
+               .tx_sch_rate    = 0x1a14,
                .int_grp        = 0x1a20,
                .hred           = 0x1a44,
                .ctx_ptr        = 0x1b00,
        },
        .qdma = {
                .qtx_cfg        = 0x4400,
+               .qtx_sch        = 0x4404,
                .rx_ptr         = 0x4500,
                .rx_cnt_cfg     = 0x4504,
                .qcrx_ptr       = 0x4508,
                .fq_tail        = 0x4724,
                .fq_count       = 0x4728,
                .fq_blen        = 0x472c,
+               .tx_sch_rate    = 0x4798,
        },
        .gdm1_cnt               = 0x1c00,
        .gdma_to_ppe            = 0x3333,
        mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id));
 }
 
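+/* Program the QDMA shaper for one hardware TX queue: a 10 Mbps minimum
+ * rate plus a maximum rate matching the negotiated link speed, so a
+ * slow port cannot hog the DMA engine and stall faster ones.
+ */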
+static void mtk_set_queue_speed(struct mtk_eth *eth, unsigned int idx,
+                               int speed)
+{
+       const struct mtk_soc_data *soc = eth->soc;
+       u32 ofs, val;
+
+       if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA))
+               return;
+
+       val = MTK_QTX_SCH_MIN_RATE_EN |
+             /* minimum: 10 Mbps */
+             FIELD_PREP(MTK_QTX_SCH_MIN_RATE_MAN, 1) |
+             FIELD_PREP(MTK_QTX_SCH_MIN_RATE_EXP, 4) |
+             MTK_QTX_SCH_LEAKY_BUCKET_SIZE;
+       if (!MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2))
+               val |= MTK_QTX_SCH_LEAKY_BUCKET_EN;
+
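+       /* The shaper rate is encoded as mantissa * 10^exponent, in Kbit/s
+        * (e.g. MAN=10, EXP=5 -> 1 Gbit/s); the MT7621 values sit slightly
+        * above line rate to leave headroom.
+        */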
+       if (IS_ENABLED(CONFIG_SOC_MT7621)) {
+               switch (speed) {
+               case SPEED_10:
+                       val |= MTK_QTX_SCH_MAX_RATE_EN |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_MAN, 103) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_EXP, 2) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_WEIGHT, 1);
+                       break;
+               case SPEED_100:
+                       val |= MTK_QTX_SCH_MAX_RATE_EN |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_MAN, 103) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_EXP, 3) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_WEIGHT, 1);
+                       break;
+               case SPEED_1000:
+                       val |= MTK_QTX_SCH_MAX_RATE_EN |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_MAN, 105) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_EXP, 4) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_WEIGHT, 10);
+                       break;
+               default:
+                       break;
+               }
+       } else {
+               switch (speed) {
+               case SPEED_10:
+                       val |= MTK_QTX_SCH_MAX_RATE_EN |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_MAN, 1) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_EXP, 4) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_WEIGHT, 1);
+                       break;
+               case SPEED_100:
+                       val |= MTK_QTX_SCH_MAX_RATE_EN |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_MAN, 1) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_EXP, 5) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_WEIGHT, 1);
+                       break;
+               case SPEED_1000:
+                       val |= MTK_QTX_SCH_MAX_RATE_EN |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_MAN, 10) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_EXP, 5) |
+                              FIELD_PREP(MTK_QTX_SCH_MAX_RATE_WEIGHT, 10);
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       ofs = MTK_QTX_OFFSET * idx;
+       mtk_w32(eth, val, soc->reg_map->qdma.qtx_sch + ofs);
+}
+
 static void mtk_mac_link_up(struct phylink_config *config,
                            struct phy_device *phy,
                            unsigned int mode, phy_interface_t interface,
                break;
        }
 
+       mtk_set_queue_speed(mac->hw, mac->id, speed);
+
        /* Configure duplex */
        if (duplex == DUPLEX_FULL)
                mcr |= MAC_MCR_FORCE_DPX;
 
        WRITE_ONCE(desc->txd1, info->addr);
 
-       data = TX_DMA_SWC | TX_DMA_PLEN0(info->size);
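+       /* Encode the TX queue in the descriptor's PQID field so the QDMA
+        * scheduler can apply the per-queue rates set in
+        * mtk_set_queue_speed().
+        */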
+       data = TX_DMA_SWC | TX_DMA_PLEN0(info->size) |
+              FIELD_PREP(TX_DMA_PQID, info->qid);
        if (info->last)
                data |= TX_DMA_LS0;
        WRITE_ONCE(desc->txd3, data);
                data |= TX_DMA_LS0;
        WRITE_ONCE(desc->txd3, data);
 
-       if (!info->qid && mac->id)
-               info->qid = MTK_QDMA_GMAC2_QID;
-
        data = (mac->id + 1) << TX_DMA_FPORT_SHIFT_V2; /* forward port */
        data |= TX_DMA_SWC_V2 | QID_BITS_V2(info->qid);
        WRITE_ONCE(desc->txd4, data);
                .gso = gso,
                .csum = skb->ip_summed == CHECKSUM_PARTIAL,
                .vlan = skb_vlan_tag_present(skb),
-               .qid = skb->mark & MTK_QDMA_TX_MASK,
+               .qid = skb_get_queue_mapping(skb),
                .vlan_tci = skb_vlan_tag_get(skb),
                .first = true,
                .last = !skb_is_nonlinear(skb),
        };
+       struct netdev_queue *txq;
        struct mtk_mac *mac = netdev_priv(dev);
        struct mtk_eth *eth = mac->hw;
        const struct mtk_soc_data *soc = eth->soc;
        struct mtk_tx_dma *itxd_pdma, *txd_pdma;
        struct mtk_tx_buf *itx_buf, *tx_buf;
        int i, n_desc = 1;
+       int queue = skb_get_queue_mapping(skb);
        int k = 0;
 
+       txq = netdev_get_tx_queue(dev, queue);
        itxd = ring->next_free;
        itxd_pdma = qdma_to_pdma(ring, itxd);
        if (itxd == ring->last_free)
                        memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info));
                        txd_info.size = min_t(unsigned int, frag_size,
                                              soc->txrx.dma_max_len);
-                       txd_info.qid = skb->mark & MTK_QDMA_TX_MASK;
+                       txd_info.qid = queue;
                        txd_info.last = i == skb_shinfo(skb)->nr_frags - 1 &&
                                        !(frag_size - txd_info.size);
                        txd_info.addr = skb_frag_dma_map(eth->dma_dev, frag,
                        txd_pdma->txd2 |= TX_DMA_LS1;
        }
 
-       netdev_sent_queue(dev, skb->len);
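+       /* byte-queue-limit accounting is now per TX queue */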
+       netdev_tx_sent_queue(txq, skb->len);
        skb_tx_timestamp(skb);
 
        ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2);
        wmb();
 
        if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) {
-               if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) ||
-                   !netdev_xmit_more())
+               if (netif_xmit_stopped(txq) || !netdev_xmit_more())
                        mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr);
        } else {
                int next_idx;
        for (i = 0; i < MTK_MAC_COUNT; i++) {
                if (!eth->netdev[i])
                        continue;
-               netif_wake_queue(eth->netdev[i]);
+               netif_tx_wake_all_queues(eth->netdev[i]);
        }
 }
 
 
        tx_num = mtk_cal_txd_req(eth, skb);
        if (unlikely(atomic_read(&ring->free_count) <= tx_num)) {
-               netif_stop_queue(dev);
+               netif_tx_stop_all_queues(dev);
                netif_err(eth, tx_queued, dev,
                          "Tx Ring full when queue awake!\n");
                spin_unlock(ð->page_lock);
                goto drop;
 
        if (unlikely(atomic_read(&ring->free_count) <= ring->thresh))
-               netif_stop_queue(dev);
+               netif_tx_stop_all_queues(dev);
 
        spin_unlock(ð->page_lock);
 
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
        const struct mtk_soc_data *soc = eth->soc;
        struct mtk_tx_ring *ring = ð->tx_ring;
+       struct mtk_mac *mac = netdev_priv(dev);
        struct mtk_tx_dma_desc_info txd_info = {
                .size   = xdpf->len,
                .first  = true,
                .last   = !xdp_frame_has_frags(xdpf),
+               .qid    = mac->id,
        };
        int err, index = 0, n_desc = 1, nr_frags;
        struct mtk_tx_buf *htx_buf, *tx_buf;
                memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info));
                txd_info.size = skb_frag_size(&sinfo->frags[index]);
                txd_info.last = index + 1 == nr_frags;
+               txd_info.qid = mac->id;
                data = skb_frag_address(&sinfo->frags[index]);
 
                index++;
        return done;
 }
 
+struct mtk_poll_state {
+       struct netdev_queue *txq;
+       unsigned int total;
+       unsigned int done;
+       unsigned int bytes;
+};
+
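+/* Record one completed skb and coalesce BQL completion updates per netdev
+ * TX queue, flushing the previous queue's totals whenever the queue changes.
+ */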
+static void
+mtk_poll_tx_done(struct mtk_eth *eth, struct mtk_poll_state *state, u8 mac,
+                struct sk_buff *skb)
+{
+       struct netdev_queue *txq;
+       struct net_device *dev;
+       unsigned int bytes = skb->len;
+
+       state->total++;
+       eth->tx_packets++;
+       eth->tx_bytes += bytes;
+
+       dev = eth->netdev[mac];
+       if (!dev)
+               return;
+
+       txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+       if (state->txq == txq) {
+               state->done++;
+               state->bytes += bytes;
+               return;
+       }
+
+       if (state->txq)
+               netdev_tx_completed_queue(state->txq, state->done, state->bytes);
+
+       state->txq = txq;
+       state->done = 1;
+       state->bytes = bytes;
+}
+
 static int mtk_poll_tx_qdma(struct mtk_eth *eth, int budget,
-                           unsigned int *done, unsigned int *bytes)
+                           struct mtk_poll_state *state)
 {
        const struct mtk_reg_map *reg_map = eth->soc->reg_map;
        struct mtk_tx_ring *ring = ð->tx_ring;
                        break;
 
                if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) {
-                       if (tx_buf->type == MTK_TYPE_SKB) {
-                               struct sk_buff *skb = tx_buf->data;
+                       if (tx_buf->type == MTK_TYPE_SKB)
+                               mtk_poll_tx_done(eth, state, mac, tx_buf->data);
 
-                               bytes[mac] += skb->len;
-                               done[mac]++;
-                       }
                        budget--;
                }
                mtk_tx_unmap(eth, tx_buf, &bq, true);
 }
 
 static int mtk_poll_tx_pdma(struct mtk_eth *eth, int budget,
-                           unsigned int *done, unsigned int *bytes)
+                           struct mtk_poll_state *state)
 {
        struct mtk_tx_ring *ring = ð->tx_ring;
        struct mtk_tx_buf *tx_buf;
                        break;
 
                if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) {
-                       if (tx_buf->type == MTK_TYPE_SKB) {
-                               struct sk_buff *skb = tx_buf->data;
-
-                               bytes[0] += skb->len;
-                               done[0]++;
-                       }
+                       if (tx_buf->type == MTK_TYPE_SKB)
+                               mtk_poll_tx_done(eth, state, 0, tx_buf->data);
                        budget--;
                }
                mtk_tx_unmap(eth, tx_buf, &bq, true);
 {
        struct mtk_tx_ring *ring = ð->tx_ring;
        struct dim_sample dim_sample = {};
-       unsigned int done[MTK_MAX_DEVS];
-       unsigned int bytes[MTK_MAX_DEVS];
-       int total = 0, i;
-
-       memset(done, 0, sizeof(done));
-       memset(bytes, 0, sizeof(bytes));
+       struct mtk_poll_state state = {};
 
        if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA))
-               budget = mtk_poll_tx_qdma(eth, budget, done, bytes);
+               budget = mtk_poll_tx_qdma(eth, budget, &state);
        else
-               budget = mtk_poll_tx_pdma(eth, budget, done, bytes);
+               budget = mtk_poll_tx_pdma(eth, budget, &state);
 
-       for (i = 0; i < MTK_MAC_COUNT; i++) {
-               if (!eth->netdev[i] || !done[i])
-                       continue;
-               netdev_completed_queue(eth->netdev[i], done[i], bytes[i]);
-               total += done[i];
-               eth->tx_packets += done[i];
-               eth->tx_bytes += bytes[i];
-       }
+       if (state.txq)
+               netdev_tx_completed_queue(state.txq, state.done, state.bytes);
 
        dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes,
                          &dim_sample);
            (atomic_read(&ring->free_count) > ring->thresh))
                mtk_wake_queue(eth);
 
-       return total;
+       return state.total;
 }
 
 static void mtk_handle_status_irq(struct mtk_eth *eth)
        int i, sz = soc->txrx.txd_size;
        struct mtk_tx_dma_v2 *txd;
        int ring_size;
+       u32 ofs, val;
 
        if (MTK_HAS_CAPS(soc->caps, MTK_QDMA))
                ring_size = MTK_QDMA_RING_SIZE;
                        ring->phys + ((ring_size - 1) * sz),
                        soc->reg_map->qdma.crx_ptr);
                mtk_w32(eth, ring->last_free_ptr, soc->reg_map->qdma.drx_ptr);
-               mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES,
-                       soc->reg_map->qdma.qtx_cfg);
+
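+               /* Give every QDMA TX queue the reservation threshold and
+                * a 10 Mbps minimum-rate scheduler default;
+                * mtk_set_queue_speed() raises the cap once the link
+                * speed is known.
+                */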
+               for (i = 0, ofs = 0; i < MTK_QDMA_NUM_QUEUES; i++) {
+                       val = (QDMA_RES_THRES << 8) | QDMA_RES_THRES;
+                       mtk_w32(eth, val, soc->reg_map->qdma.qtx_cfg + ofs);
+
+                       val = MTK_QTX_SCH_MIN_RATE_EN |
+                             /* minimum: 10 Mbps */
+                             FIELD_PREP(MTK_QTX_SCH_MIN_RATE_MAN, 1) |
+                             FIELD_PREP(MTK_QTX_SCH_MIN_RATE_EXP, 4) |
+                             MTK_QTX_SCH_LEAKY_BUCKET_SIZE;
+                       if (!MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2))
+                               val |= MTK_QTX_SCH_LEAKY_BUCKET_EN;
+                       mtk_w32(eth, val, soc->reg_map->qdma.qtx_sch + ofs);
+                       ofs += MTK_QTX_OFFSET;
+               }
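+               /* Each 16-bit half configures one TX scheduler for WFQ
+                * mode; NETSYS_V2 carries two more schedulers in the
+                * following register.
+                */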
+               val = MTK_QDMA_TX_SCH_MAX_WFQ | (MTK_QDMA_TX_SCH_MAX_WFQ << 16);
+               mtk_w32(eth, val, soc->reg_map->qdma.tx_sch_rate);
+               if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2))
+                       mtk_w32(eth, val, soc->reg_map->qdma.tx_sch_rate + 4);
        } else {
                mtk_w32(eth, ring->phys_pdma, MT7628_TX_BASE_PTR0);
                mtk_w32(eth, ring_size, MT7628_TX_MAX_CNT0);
                if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2))
                        val |= MTK_MUTLI_CNT | MTK_RESV_BUF |
                               MTK_WCOMP_EN | MTK_DMAD_WR_WDONE |
-                              MTK_CHK_DDONE_EN;
+                              MTK_CHK_DDONE_EN | MTK_LEAKY_BUCKET_EN;
                else
                        val |= MTK_RX_BT_32DWORDS;
                mtk_w32(eth, val, reg_map->qdma.glo_cfg);
 #endif
 }
 
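+/* Track link changes on DSA user ports stacked on top of this MAC and
+ * re-program the matching hardware queue's shaper to the negotiated speed.
+ */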
+static int mtk_device_event(struct notifier_block *n, unsigned long event, void *ptr)
+{
+       struct mtk_mac *mac = container_of(n, struct mtk_mac, device_notifier);
+       struct mtk_eth *eth = mac->hw;
+       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+       struct ethtool_link_ksettings s;
+       struct net_device *ldev;
+       struct list_head *iter;
+       struct dsa_port *dp;
+
+       if (event != NETDEV_CHANGE)
+               return NOTIFY_DONE;
+
+       netdev_for_each_lower_dev(dev, ldev, iter) {
+               if (netdev_priv(ldev) == mac)
+                       goto found;
+       }
+
+       return NOTIFY_DONE;
+
+found:
+       if (!dsa_slave_dev_check(dev))
+               return NOTIFY_DONE;
+
+       if (__ethtool_get_link_ksettings(dev, &s))
+               return NOTIFY_DONE;
+
+       if (s.base.speed == 0 || s.base.speed == (__u32)SPEED_UNKNOWN)
+               return NOTIFY_DONE;
+
+       dp = dsa_port_from_netdev(dev);
+       if (dp->index >= MTK_QDMA_NUM_QUEUES)
+               return NOTIFY_DONE;
+
+       mtk_set_queue_speed(eth, dp->index + 3, s.base.speed);
+
+       return NOTIFY_DONE;
+}
+
 static int mtk_open(struct net_device *dev)
 {
        struct mtk_mac *mac = netdev_priv(dev);
                refcount_inc(ð->dma_refcnt);
 
        phylink_start(mac->phylink);
-       netif_start_queue(dev);
+       netif_tx_start_all_queues(dev);
+
        return 0;
 }
 
        int i;
 
        for (i = 0; i < MTK_MAC_COUNT; i++) {
+               struct mtk_mac *mac;
+
                if (!eth->netdev[i])
                        continue;
+               mac = netdev_priv(eth->netdev[i]);
+               if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA))
+                       unregister_netdevice_notifier(&mac->device_notifier);
                unregister_netdev(eth->netdev[i]);
        }
 
        return ret;
 }
 
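+/* DSA user ports get queue 3 + switch port index (the DSA tagger stores
+ * the port in the skb queue mapping), matching mtk_device_event() above;
+ * everything else uses this MAC's own queue.
+ */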
+static u16 mtk_select_queue(struct net_device *dev, struct sk_buff *skb,
+                           struct net_device *sb_dev)
+{
+       struct mtk_mac *mac = netdev_priv(dev);
+       unsigned int queue = 0;
+
+       if (netdev_uses_dsa(dev))
+               queue = skb_get_queue_mapping(skb) + 3;
+       else
+               queue = mac->id;
+
+       if (queue >= dev->num_tx_queues)
+               queue = 0;
+
+       return queue;
+}
+
 static const struct ethtool_ops mtk_ethtool_ops = {
        .get_link_ksettings     = mtk_get_link_ksettings,
        .set_link_ksettings     = mtk_set_link_ksettings,
        .ndo_setup_tc           = mtk_eth_setup_tc,
        .ndo_bpf                = mtk_xdp,
        .ndo_xdp_xmit           = mtk_xdp_xmit,
+       .ndo_select_queue       = mtk_select_queue,
 };
 
 static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np)
        struct phylink *phylink;
        struct mtk_mac *mac;
        int id, err;
+       int txqs = 1;
 
        if (!_id) {
                dev_err(eth->dev, "missing mac id\n");
                return -EINVAL;
        }
 
-       eth->netdev[id] = alloc_etherdev(sizeof(*mac));
+       if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA))
+               txqs = MTK_QDMA_NUM_QUEUES;
+
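+       /* one netdev TX queue per hardware QDMA queue; RX stays single-queue */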
+       eth->netdev[id] = alloc_etherdev_mqs(sizeof(*mac), txqs, 1);
        if (!eth->netdev[id]) {
                dev_err(eth->dev, "alloc_etherdev failed\n");
                return -ENOMEM;
        else
                eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN;
 
+       if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
+               mac->device_notifier.notifier_call = mtk_device_event;
+               register_netdevice_notifier(&mac->device_notifier);
+       }
+
        return 0;
 
 free_netdev: