Retrieval augmented generation¶
Retrieval augmented generation (RAG) is a method of adapting LLM (large language model) text generation by giving the model access to external data sources, e.g. PDFs, transcripts, web pages, internal documentation, etc.
One common example of this is asking an LLM questions about a document (or set of documents). Here, we'll ask questions about a specific web page: Paul Graham's "How to Do Great Work" article, found at http://www.paulgraham.com/greatwork.html.
Generally, RAG works in two main steps:
- Given a query, retrieve the relevant information from the document(s)
- Use the retrieved information to generate text
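In other words, the whole pipeline boils down to two functions: a retriever and a generator. As a minimal sketch (the retrieve and generate names here are just placeholders; we implement concrete versions of both below):
def answer_with_rag(query, documents):
    # step 1: retrieve the chunks most relevant to the query
    relevant_chunks = retrieve(query, documents)
    # step 2: generate an answer conditioned on the query and the retrieved chunks
    return generate(query, relevant_chunks)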
Retrieving Information¶
The first thing we need to do is get the text from the web page. We grab the actual HTML content using requests
and then parse the HTML using BeautifulSoup.
import requests
from bs4 import BeautifulSoup
url = "http://www.paulgraham.com/greatwork.html"
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")
We can get the actual text from the parsed HTML using the get_text
method:
text = soup.body.get_text()
text[:435]
'July 2023If you collected lists of techniques for doing great work in a lot\nof different fields, what would the intersection look like? I decided\nto find out by making it.Partly my goal was to create a guide that could be used by someone\nworking in any field. But I was also curious about the shape of the\nintersection. And one thing this exercise shows is that it does\nhave a definite shape; it\'s not just a point labelled "work hard.'
Next, we'll parse the text into chunks. Each chunk is a piece of information that we can potentially retrieve, e.g. a sentence, a paragraph, a line, etc.
There's no right or wrong way to chunk information. It all depends on your use-case. If the information you want to retrieve is generally in self-contained sentences, then you should probably chunk into sentences. If the information spans multiple sentences, you can either have chunks be three sentences long (they can be overlapping or not) or even entire paragraphs.
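For example, here's a minimal sketch of a sliding-window chunker that groups sentences into (optionally overlapping) chunks; the window and stride values are arbitrary and would need tuning for your data:
def window_chunks(sentences, window=3, stride=2):
    # group consecutive sentences into overlapping chunks,
    # e.g. window=3, stride=2 -> sentences [0,1,2], [2,3,4], [4,5,6], ...
    return [
        " ".join(sentences[i : i + window])
        for i in range(0, len(sentences), stride)
    ]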
Here, we'll parse the information into individual sentences by:
- removing the date at the start of the article
- removing newline characters
- removing footnote indicators
- splitting the text into sentences by assuming each sentence ends in "." or "?"
- removing any empty strings
import re
text = text[9:]  # remove the "July 2023" date at the start
text = re.sub("\n", " ", text)  # replace newline characters with spaces
text = re.sub(r"\[\d+\]", "", text)  # remove footnote indicators, e.g. [1]
chunks = [s.strip() for s in re.split(r"\.|\?", text) if len(s.strip())]  # split into sentences, dropping empty strings
len(chunks)
758
for chunk in chunks[:5]:
    print(">", chunk)
> If you collected lists of techniques for doing great work in a lot of different fields, what would the intersection look like
> I decided to find out by making it
> Partly my goal was to create a guide that could be used by someone working in any field
> But I was also curious about the shape of the intersection
> And one thing this exercise shows is that it does have a definite shape; it's not just a point labelled "work hard
We have our chunks, but how do we retrieve "relevant" chunks from a query? Using pre-trained text embedding models!
The models we want to use are those trained on sentence similarity tasks, i.e. they are trained to map similar sentences to nearby points in n-dimensional space. Conversely, dissimilar sentences should be mapped far away from each other in n-dimensional space.
There's a very useful leaderboard for text embedding models; however, we'll use the sentence-transformers/all-MiniLM-L6-v2
model because it generally works well and is relatively small, with low inference time.
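As an aside, if you have the sentence-transformers library installed, it wraps the tokenize/pool/normalize steps we implement manually below into a single call. A rough equivalent (exact arguments may differ slightly between versions) would be:
from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# encode returns one embedding per input sentence, normalized to unit length
st_embeddings = st_model.encode(["an example sentence"], normalize_embeddings=True)
Here we'll stick with the transformers library directly, as it makes the pooling and normalization steps explicit.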
The code below defines a get_embeddings
function which tokenizes input sentences (list of strings), passes them through the model, pools them (to go from a [batch size, sequence length, embedding dim.]
tensor to a [batch size, embedding dim.]
tensor), and then normalizes the embeddings.
import transformers
import torch
import torch.nn.functional as F
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
embedding_model = transformers.AutoModel.from_pretrained(model_name)
def mean_pooling(last_hidden_state, attention_mask):
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    )
    return torch.sum(last_hidden_state * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )
def get_embeddings(sentences, tokenizer, model):
    if isinstance(sentences, str):
        sentences = [sentences]
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = mean_pooling(
        model_output.last_hidden_state, encoded_input["attention_mask"]
    )
    embeddings = F.normalize(embeddings, p=2, dim=-1)
    return embeddings
We then get the sentence embedding for each chunk in our document:
chunk_embeddings = get_embeddings(chunks, tokenizer, embedding_model)
chunk_embeddings.shape
torch.Size([758, 384])
Given a query, we calculate the sentence embedding for it:
query = "What should I work on?"
query_embedding = get_embeddings(query, tokenizer, embedding_model)
query_embedding.shape
torch.Size([1, 384])
We then calculate similarity between the query embedding and the chunk embeddings.
There are two ways we can do this. We can either use the cosine similarity, which gives us a value between -1 and +1 (higher = more similar) for each chunk embedding, or we can calculate it using the dot product. However, if our embeddings are normalized (as they are in the get_embeddings
function), then these two methods give the same values!
If the embeddings aren't normalized, then the dot product similarity takes the magnitude of the vectors into account, whilst cosine similarity doesn't. Will that matter in your case? It's hard to tell. Generally, dot product similarity is slightly cheaper to compute, but both are used in practice.
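To see why normalization makes them equal: the cosine similarity of two vectors $a$ and $b$ is $\frac{a \cdot b}{\|a\| \, \|b\|}$. When both embeddings are normalized to unit length, $\|a\| = \|b\| = 1$, so the expression reduces to exactly the dot product $a \cdot b$.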
cos_similarity = torch.cosine_similarity(query_embedding, chunk_embeddings)
dot_similarity = torch.mm(query_embedding, chunk_embeddings.T).squeeze(0)
cos_similarity.shape, dot_similarity.shape
(torch.Size([758]), torch.Size([758]))
torch.allclose(cos_similarity, dot_similarity)
True
Note: they're not exactly equal, due to floating-point precision, but the most they differ by is around 1.2e-7, so a very small amount.
abs(dot_similarity - cos_similarity).max()
tensor(1.1921e-07)
We now have a measure of how similar each chunk is to the query. We can then sort the similarities and get their indices, so indices[0]
will be the index of the most similar chunk embedding (and thus the most similar chunk string).
indices = torch.argsort(cos_similarity, descending=True)
However, we don't want to get just the single most similar chunk, as it might not contain enough (or any) information to help us answer our query. Just because a chunk is similar to the query doesn't mean it will be useful for answering it.
Generally, we take the $k$ chunks with the highest similarity.
Just like the size of the chunks, $k$ is something that should be tuned. If $k$ is too low, you risk not getting information relevant for answering the query. If $k$ is too high, you risk getting lots of irrelevant information which could cause you to answer the query incorrectly.
There's also a balancing act between the size of the chunks, $k$, and the LLM's context size. The larger your chunks, the smaller $k$ has to be for all the tokens to fit into the LLM's context. There is no single correct answer for setting $k$ and the size of your chunks; it all depends on your task and data.
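One way to sanity-check that balance is to count tokens directly. A rough sketch using the tiktoken library (assuming it's installed; here we just count the first 10 chunks as a stand-in for a retrieved set):
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4")
# total number of tokens that 10 chunks would add to the prompt
sum(len(encoding.encode(chunk)) for chunk in chunks[:10])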
Below, we'll get the 10 most relevant chunks.
k = 10
retrieved_chunks = [chunks[i.item()] for i in indices[:k]]
retrieved_chunks
["If you're not sure what to work on, guess", "What should you do if you're young and ambitious but don't know what to work on", 'The way to figure out what to work on is by working', 'The first step is to decide what to work on', "If in the course of working on one thing you discover another that's more exciting, don't be afraid to switch", 'Most people who do great work have a mix, and the more you have of the former, the harder it will be to decide what to do', 'The work you choose needs to have three qualities: it has to be something you have a natural aptitude for, that you have a deep interest in, and that offers scope to do great work', "When you're young you don't know what you're good at or what different kinds of work are like", 'What should your projects be', "Let's talk a little more about the complicated business of figuring out what to work on"]
Now we have the chunks, we can format them into a prompt for the LLM.
Generating Text¶
Generally, the prompt should provide the retrieved chunks and the query you want the LLM to answer. If you only want the LLM to answer the query using the retrieved chunks (and not its "internal knowledge"), then you should explicitly tell it to do so, as we do below. I also found it beneficial to tell the model not to mention that the query was answered using extracted text, or else the answers usually started with some variation of "According to the extracted text...".
You'll most probably have to do plenty of prompt engineering here, so it's a good idea to collect a dataset of queries and answers to judge how good your prompt is. Going further, a dataset of queries and relevant chunks helps you evaluate your retrieval, allowing you to compare different text embedding models.
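As a sketch of what that retrieval evaluation could look like, here's a tiny recall@k-style check. The example query and its "relevant" chunk indices are made up purely for illustration; in practice you'd label these by hand:
def recall_at_k(query, relevant_indices, k):
    # embed the query, rank all chunks, and check what fraction of the
    # labelled-relevant chunks appear in the top k retrieved results
    query_embedding = get_embeddings(query, tokenizer, embedding_model)
    similarity = torch.cosine_similarity(query_embedding, chunk_embeddings)
    top_k = set(torch.argsort(similarity, descending=True)[:k].tolist())
    return len(top_k & set(relevant_indices)) / len(relevant_indices)

# hypothetical labelled example: chunk indices you consider relevant to the query
recall_at_k("What should I work on?", relevant_indices=[0, 1, 2], k=10)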
Anyway, let's build our prompt:
def get_prompt(query, retrieved_chunks):
    prompt = "Here's some text extracted from a document:\n"
    for chunk in retrieved_chunks:
        prompt += f"- {chunk}\n"
    prompt += "\n"
    prompt += "Answer the following question by using the above extracted text only:\n"
    prompt += query
    prompt += "\n"
    prompt += "Do not mention the query was answered using extracted text."
    return prompt
prompt = get_prompt(query, retrieved_chunks)
print(prompt)
Here's some text extracted from a document:
- If you're not sure what to work on, guess
- What should you do if you're young and ambitious but don't know what to work on
- The way to figure out what to work on is by working
- The first step is to decide what to work on
- If in the course of working on one thing you discover another that's more exciting, don't be afraid to switch
- Most people who do great work have a mix, and the more you have of the former, the harder it will be to decide what to do
- The work you choose needs to have three qualities: it has to be something you have a natural aptitude for, that you have a deep interest in, and that offers scope to do great work
- When you're young you don't know what you're good at or what different kinds of work are like
- What should your projects be
- Let's talk a little more about the complicated business of figuring out what to work on

Answer the following question by using the above extracted text only:
What should I work on?
Do not mention the query was answered using extracted text.
Finally, we pass the prompt to the LLM (here, GPT-4) and receive our response:
import openai
openai.api_key = "YOUR_API_KEY_HERE"
response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[{"role": "user", "content": prompt}],
    temperature=0.0,
)
response["choices"][0]["message"]["content"]
"You should work on something that you have a natural aptitude for, a deep interest in, and that offers scope to do great work. If you're unsure, start working on something and if in the course of working on one thing you discover another that's more exciting, don't be afraid to switch. The first step is to decide what to work on, and the way to figure that out is by actually working."
Now, let's bundle the whole process of query/response into a single function:
def answer_query(query, tokenizer, model, chunk_embeddings, k):
    query_embedding = get_embeddings(query, tokenizer, model)
    similarity = torch.cosine_similarity(query_embedding, chunk_embeddings)
    indices = torch.argsort(similarity, descending=True)
    retrieved_chunks = [chunks[i.item()] for i in indices[:k]]
    prompt = get_prompt(query, retrieved_chunks)
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )
    return response["choices"][0]["message"]["content"]
answer_query("Should I take risks?", tokenizer, embedding_model, chunk_embeddings, k)
"Yes, you should take risks. It's important to take as much risk as you can afford, especially when you're young. Risk often comes with the fear of rejection and failure, but it's a necessary part of discovering new things. Sharing your ideas, despite the risk, can lead to new discoveries. Remember, in an efficient market, risk is proportionate to reward. So, instead of looking for certainty, look for a bet with high expected value."
RAG is also a step towards giving LLMs the ability to provide attribution/citations, as we can return the retrieved chunks to show a user what information was used to answer a query and where that information came from.
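A minimal way to support this with the code above is to return the retrieved chunks alongside the answer, e.g. as a small variation on answer_query (the answer_query_with_sources name is just for illustration):
def answer_query_with_sources(query, tokenizer, model, chunk_embeddings, k):
    query_embedding = get_embeddings(query, tokenizer, model)
    similarity = torch.cosine_similarity(query_embedding, chunk_embeddings)
    indices = torch.argsort(similarity, descending=True)
    retrieved_chunks = [chunks[i.item()] for i in indices[:k]]
    prompt = get_prompt(query, retrieved_chunks)
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )
    # return the chunks too, so they can be shown to the user as "sources"
    return response["choices"][0]["message"]["content"], retrieved_chunks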
Note how you only need to create the chunk embeddings once per document. You may have heard of vector databases and companies such as Pinecone, Weaviate, and Chroma (to name a few). These services let you upload your chunks and specify a model used to create text embeddings from them. Then, you can send a query via their API and they will do something similar to what we've done above in the retrieval step: find the chunks relevant to your query. They do a little more than that, but that's basically how they work!
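For example, a rough sketch of that workflow using Chroma's Python client might look something like the following; treat it as illustrative rather than exact, since the API details vary between versions:
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="greatwork")
# upload the chunks; Chroma embeds them with its default embedding model
# (a specific embedding model can also be configured per collection)
collection.add(documents=chunks, ids=[str(i) for i in range(len(chunks))])
# retrieval: find the 10 chunks most relevant to the query
results = collection.query(query_texts=["What should I work on?"], n_results=10)
retrieved_chunks = results["documents"][0]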