https://huggingface.co/sentence-transformers
uv init
uv add sentence-transformers langchain-huggingface langchain-chroma fiftyone
uv run main.py
sentence-transformers
: text embedding / reranker models maintained on Hugging Face
langchain-huggingface
: library that connects LangChain with Hugging Face
langchain-chroma (ChromaDB)
: vector database for storing the embedded vectors (see the sketch after this list for how it pairs with langchain-huggingface)
fiftyone
: toolkit for visualizing and analyzing image/video datasets, with Hugging Face integration
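Neither langchain-huggingface nor langchain-chroma is actually used in the script below, so here is a minimal sketch of how the two fit together. The collection name, persist directory, and example texts are made up for illustration; the model name is the same one used later in this post.

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Wrap a sentence-transformers model as a LangChain embedding function
embedding_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Store embedded texts in a local Chroma collection
vector_store = Chroma(
    collection_name="demo",
    embedding_function=embedding_fn,
    persist_directory="./chroma_db",
)
vector_store.add_texts(["Nice to meet you. My name is Jo Hyunjin.", "The weather is nice today."])

# Retrieve the stored text most similar to a query
results = vector_store.similarity_search("Who are you?", k=1)
print(results[0].page_content)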
#1. Load the model
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from sentence_transformers import SentenceTransformer
import fiftyone as fo
import numpy as np
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
#2. Prepare the sentences to embed
sentences = [
    "Nice to meet you. My name is Jo Hyunjin.",
    "Hello. I'm Jo Hyunjin.",
    "The weather is nice today."
]
#3. Embed the sentences with the model
embeddings = model.encode(sentences)
print(embeddings.shape)
#4. Check the embedding similarity results
similarity_matrix = model.similarity(embeddings, embeddings)
print(similarity_matrix)
#5. Visualize the similarity results with FiftyOne - not carried out
# dataset = fo.Dataset("sentence_similarity")
# # Add samples (note: FiftyOne expects media files such as images/videos; these .txt paths are placeholders)
# for i, sentence in enumerate(sentences):
#     sample = fo.Sample(
#         filepath=f"sample_{i}.txt",
#         text=sentence,
#         embedding=embeddings[i].tolist()
#     )
#     dataset.add_sample(sample)
# # Launch the visualization (fo.launch_app is the usual entry point)
# session = fo.launch_app(dataset)
# session.wait()
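Since the FiftyOne visualization was skipped, a quick alternative is to draw the similarity matrix as a heatmap. This is only a sketch: matplotlib is not in the uv add command above and would need to be installed separately (uv add matplotlib).

import matplotlib.pyplot as plt

# Render the 3x3 cosine similarity matrix as a heatmap (matplotlib is an extra dependency)
matrix = similarity_matrix.cpu().numpy()
fig, ax = plt.subplots()
im = ax.imshow(matrix, vmin=-1, vmax=1, cmap="viridis")
ax.set_xticks(range(len(sentences)))
ax.set_yticks(range(len(sentences)))
fig.colorbar(im, ax=ax)
plt.show()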
The shape output, (3, 384), means that the 3 sentences were embedded and that each embedding vector has 384 dimensions.
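As a quick sanity check (a minimal sketch, assuming the model and embeddings variables from the script above), the 384 can be read back from the model itself:

# The second axis of embeddings.shape should equal the model's embedding dimension
print(model.get_sentence_embedding_dimension())  # 384 for all-MiniLM-L6-v2
assert embeddings.shape == (len(sentences), model.get_sentence_embedding_dimension())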
The printed tensor is the cosine similarity matrix between the sentence embedding vectors: entry (i, j) is the cosine similarity of sentence i's embedding with sentence j's embedding, so the diagonal entries are 1.
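To make the connection to the definition explicit, the same matrix can be recomputed by hand as a · b / (|a| |b|), i.e. the pairwise dot products of the unit-normalized embeddings (a minimal sketch, assuming the embeddings array from the script above):

import numpy as np

# Normalize each embedding to unit length; the pairwise dot products are then cosine similarities
unit = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
manual_similarity = unit @ unit.T
print(manual_similarity)  # matches model.similarity(embeddings, embeddings) up to floating-point error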