# Source code for app.alba_core.extraction
"""Requirement extraction for Alba Core.
This module owns the first step of the CRM/matching flow: turning a renter's
plain-English message into a :class:`RentalRequirements` object.
Inputs:
* the latest customer message;
* optional existing requirements from an earlier conversation turn;
* known locations gathered from the real PropertyMe-backed cache.
Outputs:
* structured requirements;
* extraction confidence;
* short notes explaining what was detected.
Do not add property ranking or lead-writing logic here. Extraction should only
describe what the renter asked for.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Iterable
from app.alba_core.models import RentalRequirements
# Spelled-out counts ("two bedrooms", "three grand") mapped to their integer
# value. Built from an ordered tuple so the word/value pairing is positional.
NUMBER_WORDS = {
    word: value
    for value, word in enumerate(
        (
            "one",
            "two",
            "three",
            "four",
            "five",
            "six",
            "seven",
            "eight",
            "nine",
            "ten",
        ),
        start=1,
    )
}

# Renter wording -> canonical property type. Insertion order is preserved
# (house synonyms, then apartment, then townhouse) because the extractor walks
# this mapping in order when looking for a match.
PROPERTY_TYPE_WORDS = {
    synonym: canonical
    for canonical, synonyms in (
        ("house", ("house", "home")),
        ("apartment", ("apartment", "apt", "flat", "unit")),
        ("townhouse", ("townhouse", "town house", "townhome")),
    )
    for synonym in synonyms
}

# Lifestyle features; each search term currently maps to itself. The order
# here is the order in which detected features are reported.
FEATURE_WORDS = {
    feature: feature
    for feature in (
        "pool",
        "spa",
        "view",
        "sea view",
        "waterfront",
        "garden",
        "gym",
    )
}

# Phrases (already lower-case) that signal the renter has no firm preference.
# Includes spellings with and without the apostrophe.
NO_PREFERENCE_PATTERNS = [
    "no preference",
    "not a concern",
    "isn't a concern",
    "isnt a concern",
    "dont care",
    "don't care",
    "flexible",
    "nothing",
]
# [docs]
@dataclass(slots=True)
class ExtractionOutput:
"""Result returned by :class:`RequirementExtractor`.
The output keeps requirements and explanation notes together so demos,
tests, and future channel wrappers can show what Alba Core understood.
"""
requirements: RentalRequirements
confidence: float
notes: list[str]
[docs]
def to_dict(self) -> dict[str, object]:
"""Return the API-friendly extraction response."""
return {
"requirements": self.requirements.to_dict(),
"confidence": round(self.confidence, 2),
"notes": self.notes,
"missing_required_fields": self.requirements.missing_required_fields,
}
# [docs]
class RequirementExtractor:
    """Deterministic extraction for common renter wording.

    AI can be added later as a helper, but this class keeps the baseline
    repeatable and easy to test.
    """

    # A number as renters type it: digits, optional thousands groups and an
    # optional decimal part ("850", "1,200", "2.5"). One pattern instead of an
    # alternation so "$2.5k" is read as 2.5 rather than stopping at "2".
    _AMOUNT = r"(?P<amount>\d+(?:,\d{3})*(?:\.\d+)?)"

    # Negative lookahead placed directly before a number in the keyword-based
    # budget patterns. It rejects one- or two-digit numbers that are followed
    # by a count/duration label ("3 bedroom", "2 weeks"), so phrases such as
    # "rent a 3 bedroom house" are not read as a 3000 budget. Three-digit and
    # larger numbers are never rejected ("budget 900 pets allowed" stays 900).
    _NOT_SMALL_COUNT = (
        r"(?!\d{1,2}(?:\.\d+)?\+?[\s-]*(?:or\s+more\s+)?"
        r"(?:beds?|bedrooms?|br|rooms?|baths?|bathrooms?|cars?|car\s*parks?|garages?|parking|"
        r"days?|weeks?|months?|years?|mins?|minutes?|kms?|people|persons?|adults?|kids?|"
        r"children|pets?|dogs?|cats?)\b)"
    )

    # Tried in order; the first pattern that matches wins.
    _BUDGET_PATTERNS = (
        # Explicit money marker: "$850", "nzd 1,200", "$2.5k".
        re.compile(rf"(?:\$|nzd\s*){_AMOUNT}\s*(?P<suffix>k|grand)?\b"),
        # Thousands suffix without a marker: "2k", "1.5 grand".
        re.compile(rf"\b{_AMOUNT}\s*(?P<suffix>k|grand)\b"),
        # Budget wording before the number: "budget is 900".
        re.compile(rf"\b(?:budget|rent|weekly|per week|pw)\D{{0,20}}{_NOT_SMALL_COUNT}{_AMOUNT}\b"),
        # Budget wording after the number: "900 per week".
        re.compile(rf"\b{_NOT_SMALL_COUNT}{_AMOUNT}\D{{0,20}}(?:budget|rent|weekly|per week|pw)\b"),
        # Upper-limit wording: "under 5000", "up to 900".
        re.compile(
            rf"\b(?:under|up to|max|maximum|less than|below)\D{{0,12}}{_NOT_SMALL_COUNT}{_AMOUNT}\b"
        ),
    )

    # Explicit "I do not need parking" wording.
    _NO_PARKING = re.compile(
        r"\b(?:no|without|no need for|(?:don['’]?t|do not|doesn['’]?t|does not) need)"
        r"\s+(?:a\s+|any\s+)?(?:parking|garage|car\s+space)"
    )

    def __init__(self, known_locations: Iterable[str]) -> None:
        """Create an extractor grounded in locations Alba Core can serve.

        Locations are case-folded, stripped and de-duplicated, then sorted
        longest-first so names like "St Marys Bay" are matched before shorter
        overlapping terms. Equal-length names are ordered alphabetically so
        the result does not depend on set iteration order.
        """
        cleaned = {
            location.strip().casefold()
            for location in known_locations
            if location and location.strip()
        }
        self.known_locations = sorted(cleaned, key=lambda name: (-len(name), name))

    def extract(
        self,
        message: str,
        current: RentalRequirements | None = None,
    ) -> ExtractionOutput:
        """Extract requirements from a single message and merge prior state.

        This method is intentionally a readable sequence of small extraction
        steps. When adding a new field, add the helper below and then call it in
        this flow so new contributors can see the full order at a glance.

        Args:
            message: The latest customer message. An empty or missing message
                simply yields no detections.
            current: Requirements from an earlier turn; a fresh
                ``RentalRequirements`` is used when omitted.

        Returns:
            The merged requirements, a confidence score and the detection notes.
        """
        if current is None:
            current = RentalRequirements()
        lower = (message or "").casefold()
        patch = RentalRequirements()
        notes: list[str] = []

        city_or_suburb = self._extract_location(lower)
        if city_or_suburb:
            patch.city = city_or_suburb.title()
            notes.append(f"location detected: {patch.city}")

        budget = self._extract_budget(lower)
        if budget is not None:
            patch.budget_max = budget
            notes.append(f"budget detected: {budget:.0f}")

        bedrooms = self._extract_count(lower, ["bed", "beds", "bedroom", "bedrooms", "br"])
        if bedrooms is not None:
            patch.bedrooms_min = bedrooms
            notes.append(f"bedrooms detected: {bedrooms}+")

        bathrooms = self._extract_count(lower, ["bath", "baths", "bathroom", "bathrooms"])
        if bathrooms is not None:
            patch.bathrooms_min = bathrooms
            notes.append(f"bathrooms detected: {bathrooms}+")

        parking = self._extract_parking(lower)
        if parking is not None:
            patch.parking_min = parking
            notes.append("parking is not required" if parking == 0 else f"parking detected: {parking}+")

        property_type = self._extract_property_type(lower)
        if property_type:
            patch.property_type = property_type
            notes.append(f"property type detected: {property_type}")

        move_in = self._extract_move_in(lower)
        if move_in:
            patch.move_in_timing = move_in
            notes.append(f"move-in timing detected: {move_in}")

        priority = self._extract_priority(lower)
        if priority:
            patch.priority = priority
            notes.append(f"priority detected: {priority}")

        patch.features = self._extract_features(lower)
        if patch.features:
            notes.append(f"features detected: {', '.join(patch.features)}")

        self._extract_boolean_preferences(lower, patch)
        patch.no_preference = self._extract_no_preference(lower)

        merged = current.merge(patch)
        confidence = self._confidence_for(merged, notes)
        return ExtractionOutput(requirements=merged, confidence=confidence, notes=notes)

    def _extract_location(self, lower: str) -> str | None:
        """Return the first known cache-backed location mentioned by the user.

        Locations are tried in ``known_locations`` order (longest first) and
        must appear as whole words in the lower-cased message.
        """
        for location in self.known_locations:
            if re.search(rf"\b{re.escape(location)}\b", lower):
                return location
        return None

    def _extract_budget(self, lower: str) -> float | None:
        """Extract a weekly budget without confusing counts for money.

        Renter messages often contain bare numbers for bedrooms, bathrooms, and
        budget. A number is only treated as rent when it has a money marker, a
        "k"/"grand" suffix, budget wording nearby, or a phrase such as
        "under 5000". In the wording-based patterns a small number followed by
        a count label ("3 bedroom", "2 weeks") is skipped.

        Returns:
            The budget as a float, or ``None`` when no plausible amount is
            found. Amounts with a thousands suffix, or below 100, are scaled
            by 1000; anything still below 150 is discarded.
        """
        word_match = re.search(
            r"\b(one|two|three|four|five|six|seven|eight|nine|ten)\s+(?:grand|k)\b", lower
        )
        if word_match:
            return float(NUMBER_WORDS[word_match.group(1)] * 1000)

        money_match = None
        for pattern in self._BUDGET_PATTERNS:
            money_match = pattern.search(lower)
            if money_match:
                break
        if money_match is None:
            return None

        amount = float(money_match.group("amount").replace(",", ""))
        # Only the first two patterns define a suffix group; absent or
        # unmatched groups both come back as None.
        suffix = money_match.groupdict().get("suffix")
        if suffix in {"k", "grand"} or amount < 100:
            amount *= 1000
        if amount < 150:
            return None
        return amount

    def _extract_count(self, lower: str, labels: list[str]) -> int | None:
        """Extract a numeric or word count next to labels such as bedrooms.

        Accepts a space or hyphen between number and label ("3 bed",
        "3-bedroom"), an optional trailing plus ("3+ bedrooms") and
        "or more" wording. Digits are preferred over number words.
        """
        if not labels:
            # An empty alternation would match any number in the message.
            return None
        label_pattern = "|".join(re.escape(label) for label in labels)

        digit_match = re.search(
            rf"\b(\d+)\+?[\s-]*(?:or\s+more\s+)?(?:{label_pattern})\b", lower
        )
        if digit_match:
            return int(digit_match.group(1))

        word_pattern = "|".join(NUMBER_WORDS)
        word_match = re.search(
            rf"\b({word_pattern})[\s-]*(?:or\s+more\s+)?(?:{label_pattern})\b", lower
        )
        if word_match:
            return NUMBER_WORDS[word_match.group(1)]
        return None

    def _extract_parking(self, lower: str) -> int | None:
        """Extract parking need, including explicit flexible/no-parking wording.

        Returns:
            ``None`` when parking is not mentioned, ``0`` when the renter says
            it is not needed or is flexible, the stated count when one is
            given, and ``1`` when parking is mentioned without a count.
        """
        if "parking" not in lower and "garage" not in lower and "car space" not in lower:
            return None
        # "no parking needed" / "don't need a garage" must not fall through to
        # the default of one space.
        if self._NO_PARKING.search(lower):
            return 0
        if any(pattern in lower for pattern in NO_PREFERENCE_PATTERNS):
            return 0
        count = self._extract_count(lower, ["parking", "garage", "garages", "car space", "car spaces"])
        return count if count is not None else 1

    def _extract_property_type(self, lower: str) -> str | None:
        """Normalise common property type synonyms to one internal value.

        Synonyms are considered in ``PROPERTY_TYPE_WORDS`` order, but a match
        that lies inside a longer synonym match is ignored, so "town house"
        resolves to townhouse rather than to the embedded "house".
        """
        hits: list[tuple[tuple[int, int], str]] = []
        for word, property_type in PROPERTY_TYPE_WORDS.items():
            for found in re.finditer(rf"\b{re.escape(word)}\b", lower):
                hits.append((found.span(), property_type))

        for span, property_type in hits:
            shadowed = any(
                other != span and other[0] <= span[0] and span[1] <= other[1]
                for other, _ in hits
            )
            if not shadowed:
                return property_type
        return None

    def _extract_move_in(self, lower: str) -> str | None:
        """Extract simple move-in timing phrases used by matching and summary.

        Returns "asap", "next month", the matched "in N week(s)" phrase, or
        ``None`` when no timing wording is present.
        """
        if re.search(r"\b(asap|immediately|right away|now)\b", lower):
            return "asap"
        if re.search(r"\bnext month\b", lower):
            return "next month"
        weeks_match = re.search(r"\bin\s+\d+\s+weeks?\b", lower)
        return weeks_match.group(0) if weeks_match else None

    def _extract_priority(self, lower: str) -> str | None:
        """Extract the user's stated priority only when the message asks for one.

        A priority is reported only if the message contains "matters most",
        "priority" or "important"; the first category (in the order below)
        with any keyword present in the message is returned.
        """
        priorities = {
            "location": ["location", "area", "suburb", "nice spot"],
            "price": ["price", "budget", "cheap", "affordable"],
            "space": ["space", "bedrooms", "large", "room"],
            "move-in timing": ["move in", "asap", "available"],
            "lifestyle extras": ["pool", "view", "waterfront", "garden", "spa"],
        }
        if "matters most" in lower or "priority" in lower or "important" in lower:
            for priority, words in priorities.items():
                if any(word in lower for word in words):
                    return priority
        return None

    def _extract_features(self, lower: str) -> list[str]:
        """Return lifestyle features that can later affect soft scoring.

        Features must appear as whole words (an optional plural "s" is
        allowed), so "spacious" or "car space" no longer count as "spa".
        """
        return [
            feature
            for word, feature in FEATURE_WORDS.items()
            if re.search(rf"\b{re.escape(word)}s?\b", lower)
        ]

    def _extract_boolean_preferences(self, lower: str, patch: RentalRequirements) -> None:
        """Patch simple yes/no preferences such as pets and furnishing.

        Negative wording is checked first because the positive phrases are
        substrings of it ("not furnished" contains "furnished", "no pets
        allowed" contains "pets allowed"). Attributes are left untouched when
        the message does not mention them.
        """
        if "no pets" in lower or "without pets" in lower:
            patch.pets_required = False
        elif any(
            phrase in lower
            for phrase in ("pet friendly", "pet-friendly", "pets allowed", "with pets")
        ):
            patch.pets_required = True

        if "unfurnished" in lower or "not furnished" in lower:
            patch.furnished_required = False
        elif "furnished" in lower:
            patch.furnished_required = True

    def _extract_no_preference(self, lower: str) -> list[str]:
        """Record fields the renter explicitly says are flexible.

        Returns an empty list when no no-preference wording is present, and
        ``["unspecified"]`` when such wording is present but no known field
        is mentioned alongside it.
        """
        if not any(pattern in lower for pattern in NO_PREFERENCE_PATTERNS):
            return []
        fields: list[str] = []
        if "parking" in lower:
            fields.append("parking")
        if "bath" in lower:
            fields.append("bathrooms")
        if "must" in lower or "feature" in lower or "extra" in lower:
            fields.append("features")
        if not fields:
            fields.append("unspecified")
        return fields

    def _confidence_for(self, requirements: RentalRequirements, notes: list[str]) -> float:
        """Estimate confidence from detected fields and search readiness.

        Starts at 0.35, adds 0.08 per note (capped at 0.35), adds 0.25 when
        the requirements are search-ready, and caps the total at 0.95.
        """
        score = 0.35
        score += min(len(notes) * 0.08, 0.35)
        if requirements.is_search_ready:
            score += 0.25
        return min(score, 0.95)