Embedding Operators

CUDA Operators

Tensor split_embedding_codegen_lookup_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_adam_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t iter = 0, int64_t counter_halflife = -1, int64_t adjustment_iter = -1, double adjustment_ub = 1.0, int64_t learning_rate_mode = -1, int64_t weight_decay_mode = 1, int64_t grad_sum_decay = -1, double max_counter = 0, double tail_id_threshold = 0.0, int64_t is_tail_id_thresh_ratio = 0, int64_t regularization_mode = 0, double weight_norm_coefficient = 0.0, double lower_bound = 0.0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_approx_sgd_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, double learning_rate = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_lamb_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_lars_sgd_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate = 0, double eta = 0, double momentum = 0, double weight_decay = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_none_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, int64_t total_hash_size = 0, int64_t total_unique_indices = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_partial_rowwise_adam_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_partial_rowwise_lamb_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_rowwise_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, double max_norm = 0.0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t iter = 0, int64_t counter_halflife = -1, int64_t adjustment_iter = -1, double adjustment_ub = 1.0, int64_t learning_rate_mode = -1, int64_t weight_decay_mode = 1, int64_t grad_sum_decay = -1, double max_counter = 0, double tail_id_threshold = 0.0, int64_t is_tail_id_thresh_ratio = 0, int64_t regularization_mode = 0, double weight_norm_coefficient = 0.0, double lower_bound = 0.0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_rowwise_weighted_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
Tensor split_embedding_codegen_lookup_sgd_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, double learning_rate = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
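The split_embedding_codegen_lookup_*_function variants above all share their leading arguments: flattened table storage (dev_weights, uvm_weights, lxu_cache_weights plus their placements/offsets), layout metadata (D_offsets, hash_size_cumsum), and a CSR-style (indices, offsets) batch. They differ only in the trailing optimizer state tensors and hyperparameters. Below is a minimal, hypothetical sketch of calling the SGD variant for a single device-resident table; the placement code (0 = device-resident), the T+1 cumulative layout of D_offsets/hash_size_cumsum, and the visibility of the declaration above are assumptions for illustration, not statements from this page.

```cpp
// Sketch only: assumes the declaration of
// split_embedding_codegen_lookup_sgd_function shown above is in scope.
#include <ATen/ATen.h>

at::Tensor sgd_lookup_sketch() {
  const auto opts = at::device(at::kCUDA);
  const int64_t E = 1000;  // rows in the single table (assumed)
  const int64_t D = 64;    // embedding dimension (assumed)
  const int64_t B = 8;     // batch size (assumed)

  // Flattened FP32 weights; the UVM and cache tensors are empty because the
  // table lives entirely on the device in this sketch.
  auto dev_weights = at::randn({E * D}, opts.dtype(at::kFloat));
  auto uvm_weights = at::empty({0}, opts.dtype(at::kFloat));
  auto lxu_cache_weights = at::empty({0, 0}, opts.dtype(at::kFloat));
  auto weights_placements = at::zeros({1}, opts.dtype(at::kInt));  // 0 = device (assumed)
  auto weights_offsets = at::zeros({1}, opts.dtype(at::kLong));

  // One feature: cumulative dims [0, D] and cumulative hash sizes [0, E].
  auto D_offsets = at::tensor({0, static_cast<int>(D)}, opts.dtype(at::kInt));
  auto hash_size_cumsum = at::tensor({0, static_cast<int>(E)}, opts.dtype(at::kLong));
  const int64_t total_hash_size_bits = 10;  // >= ceil(log2(E)) (assumed convention)

  // CSR-style batch: each of the B samples pools over 2 ids.
  auto indices = at::randint(E, {B * 2}, opts.dtype(at::kLong));
  auto offsets = at::arange(0, 2 * B + 1, 2, opts.dtype(at::kLong));

  auto lxu_cache_locations = at::empty({0}, opts.dtype(at::kInt));
  auto placeholder = at::empty({0}, opts.dtype(at::kFloat));

  // Trailing arguments (output_dtype, VBE tensors, ...) keep their defaults.
  return split_embedding_codegen_lookup_sgd_function(
      placeholder, dev_weights, uvm_weights, lxu_cache_weights,
      weights_placements, weights_offsets, D_offsets,
      /*total_D=*/D, /*max_D=*/D, hash_size_cumsum, total_hash_size_bits,
      indices, offsets, /*pooling_mode=*/0,
      /*indice_weights=*/c10::nullopt, /*feature_requires_grad=*/c10::nullopt,
      lxu_cache_locations, /*gradient_clipping=*/false,
      /*max_gradient=*/0.0, /*stochastic_rounding=*/false,
      /*learning_rate=*/0.05);
}
```

The stateful optimizers follow the same pattern, appending their state tensors (momentum1_*, momentum2_*, prev_iter_*, row_counter_*) and hyperparameters after stochastic_rounding, as shown in the signatures above.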
void bounds_check_indices_cuda(Tensor &rows_per_table, Tensor &indices, Tensor &offsets, int64_t bounds_check_mode, Tensor &warning, const c10::optional<Tensor> &weights, const c10::optional<Tensor> &B_offsets, const int64_t max_B)
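bounds_check_indices_cuda is typically run on the (indices, offsets) pair before one of the lookups above, validating each index against its table's row count. A minimal sketch follows, assuming that mode 1 means "log a warning and fix up" (the mode encoding is an assumption) and that warning is a one-element counter the kernel increments:

```cpp
#include <ATen/ATen.h>

void bounds_check_sketch(at::Tensor& indices, at::Tensor& offsets) {
  const auto opts = at::device(at::kCUDA);
  // One table with 1000 rows (assumed to match the table being looked up).
  auto rows_per_table = at::tensor({1000}, opts.dtype(at::kLong));
  auto warning = at::zeros({1}, opts.dtype(at::kLong));

  bounds_check_indices_cuda(
      rows_per_table, indices, offsets,
      /*bounds_check_mode=*/1, warning,
      /*weights=*/c10::nullopt, /*B_offsets=*/c10::nullopt, /*max_B=*/-1);

  // A nonzero counter afterwards indicates indices that fell outside
  // [0, rows_per_table) and were reported or fixed up per the mode.
}
```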
Tensor int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias)
Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias, c10::optional<Tensor> cache_hash_size_cumsum, c10::optional<int64_t> total_cache_hash_size, c10::optional<Tensor> cache_index_table_map, c10::optional<Tensor> lxu_cache_state, c10::optional<Tensor> lxu_state)

Similar to int_nbit_split_embedding_codegen_lookup_function, but performs the lookup through the UVM cache (UVM_CACHING), taking the additional cache-state tensors listed above.
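The quantized lookups take all tables as flattened uint8 storage plus a per-table type code (weights_tys) and per-type maximum dimensions. Below is a forward-only, hypothetical sketch for a single INT8 table; the SparseType integer codes (INT8 = 2, FP32 = 0 for output_dtype) and the per-row layout (D quantized bytes followed by scale/bias metadata) are assumptions about the packed format, not stated on this page.

```cpp
#include <ATen/ATen.h>

at::Tensor int8_lookup_sketch() {
  const auto opts = at::device(at::kCUDA);
  const int64_t E = 100, D = 32, B = 4;

  // Each INT8 row is assumed to hold D quantized bytes plus 4 bytes of
  // scale/bias metadata; all tables share one flattened uint8 buffer.
  auto dev_weights = at::zeros({E * (D + 4)}, opts.dtype(at::kByte));
  auto uvm_weights = at::empty({0}, opts.dtype(at::kByte));
  auto weights_placements = at::zeros({1}, opts.dtype(at::kInt));  // 0 = device (assumed)
  auto weights_offsets = at::zeros({1}, opts.dtype(at::kLong));
  auto weights_tys = at::full({1}, 2, opts.dtype(at::kByte));      // 2 = INT8 (assumed)
  auto D_offsets = at::tensor({0, static_cast<int>(D)}, opts.dtype(at::kInt));

  auto indices = at::randint(E, {B * 2}, opts.dtype(at::kLong));
  auto offsets = at::arange(0, 2 * B + 1, 2, opts.dtype(at::kLong));

  return int_nbit_split_embedding_codegen_lookup_function(
      dev_weights, uvm_weights, weights_placements, weights_offsets,
      weights_tys, D_offsets, /*total_D=*/D,
      /*max_int2_D=*/0, /*max_int4_D=*/0, /*max_int8_D=*/D,
      /*max_float16_D=*/0, /*max_float32_D=*/0,
      indices, offsets, /*pooling_mode=*/0,
      /*indice_weights=*/c10::nullopt, /*output_dtype=*/0,  // 0 = FP32 (assumed)
      /*lxu_cache_weights=*/c10::nullopt, /*lxu_cache_locations=*/c10::nullopt,
      /*row_alignment=*/c10::nullopt, /*max_float8_D=*/c10::nullopt,
      /*fp8_exponent_bits=*/c10::nullopt, /*fp8_exponent_bias=*/c10::nullopt);
}
```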

Tensor pruned_hashmap_lookup_cuda(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
Tensor pruned_array_lookup_cuda(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets)
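Both pruned-lookup kernels translate raw ids into post-pruning dense ids. For the array form, the contract is a flattened per-table remapping array indexed by raw id; a hypothetical sketch follows, assuming pruned rows map to -1, that index_remappings_offsets is a T+1 cumulative array over the flattened remappings, and that the dtypes used here match the kernel's expectations:

```cpp
#include <ATen/ATen.h>

at::Tensor remap_sketch(const at::Tensor& indices, const at::Tensor& offsets) {
  const auto opts = at::device(at::kCUDA);
  // One table with 6 raw ids; ids 2 and 4 were pruned, so surviving rows are
  // renumbered densely and pruned slots hold -1 (assumed convention).
  auto index_remappings =
      at::tensor({0, 1, -1, 2, -1, 3}, opts.dtype(at::kInt));
  auto index_remappings_offsets = at::tensor({0, 6}, opts.dtype(at::kLong));

  // Returns dense indices aligned with `indices`: raw id 3 comes back as 2
  // here, and a pruned raw id comes back as -1.
  return pruned_array_lookup_cuda(
      indices, offsets, index_remappings, index_remappings_offsets);
}
```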

CPU Operators

Tensor int_nbit_split_embedding_codegen_lookup_function_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias)
Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias, c10::optional<Tensor> cache_hash_size_cumsum, c10::optional<int64_t> total_cache_hash_size, c10::optional<Tensor> cache_index_table_map, c10::optional<Tensor> lxu_cache_state, c10::optional<Tensor> lxu_state)
void pruned_hashmap_insert_unweighted_cpu(Tensor indices, Tensor dense_indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
Tensor pruned_hashmap_lookup_unweighted_cpu(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
Tensor pruned_array_lookup_cpu(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets)
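On CPU, the hash-map form of pruning is populated and queried as a pair: pruned_hashmap_insert_unweighted_cpu writes (raw id, dense id) pairs into the table, and pruned_hashmap_lookup_unweighted_cpu reads them back for a batch. A hypothetical round-trip sketch, where the table layout (two int32 columns of key/value, -1 for empty slots, slot-indexed offsets) is an assumption about the packed format:

```cpp
#include <ATen/ATen.h>

at::Tensor hashmap_roundtrip_sketch() {
  // One sample looking up two raw ids in a single table.
  auto indices = at::tensor({11, 42}, at::dtype(at::kLong));
  auto dense_indices = at::tensor({0, 1}, at::dtype(at::kLong));
  auto offsets = at::tensor({0, 2}, at::dtype(at::kLong));

  // 8 [key, value] slots for the single table, initialized to empty (-1).
  auto hash_table = at::full({8, 2}, -1, at::dtype(at::kInt));
  auto hash_table_offsets = at::tensor({0, 8}, at::dtype(at::kLong));

  // Insert the (raw id -> dense id) pairs, then query the same batch back;
  // the lookup should return dense_indices for this (indices, offsets) pair.
  pruned_hashmap_insert_unweighted_cpu(
      indices, dense_indices, offsets, hash_table, hash_table_offsets);
  return pruned_hashmap_lookup_unweighted_cpu(
      indices, offsets, hash_table, hash_table_offsets);
}
```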
