Debugging: tracking memory usage
import gc
import os

import psutil
import torch

# Assumes `model` and `my_paginated_samples_loader` are already defined
process = psutil.Process(os.getpid())

embeddings = []
with torch.no_grad():
    for i, samples_page in enumerate(my_paginated_samples_loader):
        batch_embeddings = model.encode(samples_page, convert_to_numpy=True)
        embeddings.extend(batch_embeddings)
        if i % 10 == 0:  # Log every 10 batches
            mem_mb = process.memory_info().rss / 1024 / 1024  # Resident set size in MB
            print(f"Batch {i}: Memory usage: {mem_mb:.2f} MB")
            gc.collect()  # Force garbage collection
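For context, my_paginated_samples_loader is assumed throughout to be an iterable that yields lists of strings one page at a time, so the full dataset never sits in memory at once. A minimal sketch of such a loader, assuming one sample per line in a text file (the path and page size are placeholders, not from the original thread):

def paginate(path, page_size=1000):
    # Yield lists of up to page_size lines without loading the whole file
    page = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            page.append(line.rstrip("\n"))
            if len(page) == page_size:
                yield page
                page = []
    if page:  # Final partial page
        yield page

my_paginated_samples_loader = paginate("samples.txt")  # Placeholder path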
Try disabling gradient computation:
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    tokenizer_kwargs={"padding_side": "left"},
)
# max_seq_length is set as an attribute, not a constructor argument
model.max_seq_length = 512  # or 256, 384 depending on your needs
model.eval()  # Set to eval mode

embeddings = []
with torch.no_grad():  # Critical: disable gradient tracking
    for samples_page in my_paginated_samples_loader:
        # Convert to numpy immediately and skip the tqdm progress bar overhead
        batch_embeddings = model.encode(
            samples_page,
            batch_size=16,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        embeddings.extend(batch_embeddings)

        # Optional: clear the CUDA cache periodically
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
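If GPU memory is still tight, another lever (not from the original thread) is loading the model in half precision, which roughly halves the weight and activation footprint. A minimal sketch, assuming a CUDA device is available; model_kwargs is forwarded to the underlying transformers model:

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    tokenizer_kwargs={"padding_side": "left"},
    model_kwargs={"torch_dtype": torch.float16},  # Load weights in fp16
    device="cuda",
)
model.max_seq_length = 512

Embedding quality in fp16 is usually close to fp32 for retrieval, but it is worth spot-checking on your own data.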
Incrementally write embeddings to disk
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B",
                            tokenizer_kwargs={"padding_side": "left"})
model.eval()

# Read the dimension from the model rather than hardcoding it
# (Qwen3-Embedding-0.6B produces 1024-dimensional embeddings, not 768)
embedding_dim = model.get_sentence_embedding_dimension()

# Preallocate a disk-backed array large enough for the full dataset
mmap_file = np.memmap('embeddings.dat', dtype='float32', mode='w+',
                      shape=(100_000_000, embedding_dim))

offset = 0
with torch.no_grad():
    for samples_page in my_paginated_samples_loader:
        batch_embeddings = model.encode(samples_page, convert_to_numpy=True)
        n = len(batch_embeddings)
        mmap_file[offset:offset + n] = batch_embeddings
        offset += n
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

mmap_file.flush()  # Ensure everything is written out to disk
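To read the embeddings back later, reopen the same file read-only with the same dtype and shape, and slice off the rows that were actually written. A minimal sketch; offset here is the final counter from the loop above, which you would need to persist separately (e.g. in a small metadata file):

import numpy as np

embedding_dim = 1024  # Must match the dimension used when writing
all_embeddings = np.memmap('embeddings.dat', dtype='float32', mode='r',
                           shape=(100_000_000, embedding_dim))
written = all_embeddings[:offset]  # Rows beyond offset are untouched zeros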
Setting max_seq_length is helping; it seems the issue is related to the dataset being multilingual (lots of unique tokens): github.com/huggingface/sentence-transformers/issues/1795
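To pick a max_seq_length value empirically rather than guessing, one option is to tokenize a sample of the dataset and look at the length distribution first. A minimal sketch using the model's tokenizer; sample_texts is a placeholder for a few thousand of your raw samples:

import numpy as np

# sample_texts: hypothetical list holding a few thousand raw samples
lengths = [len(model.tokenizer(text)["input_ids"]) for text in sample_texts]
print(f"median: {np.median(lengths):.0f}, "
      f"p95: {np.percentile(lengths, 95):.0f}, "
      f"max: {max(lengths)}")
# If the 95th percentile is well below 512, a smaller max_seq_length
# loses little information while cutting padding and activation memory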