diff --git a/pom.xml b/pom.xml index 8169ff7..3eb4052 100644 --- a/pom.xml +++ b/pom.xml @@ -10,9 +10,11 @@ big-data http://maven.apache.org - - UTF-8 - + + UTF-8 + 16 + 16 + @@ -26,5 +28,20 @@ 4.7 test + + org.slf4j + slf4j-api + 1.7.32 + + + org.slf4j + slf4j-simple + 1.7.5 + + + com.google.guava + guava + 31.0.1-jre + diff --git a/src/main/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/main/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java new file mode 100644 index 0000000..1f194e3 --- /dev/null +++ b/src/main/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -0,0 +1,135 @@ +package nearsoft.academy.bigdata.recommendation; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; + +public class MovieRecommender { + + private static final Logger log = LoggerFactory.getLogger(MovieRecommender.class); + + public static final String ROOT_PATH = System.getProperty("user.dir"); + private final String CSV_PATH = ROOT_PATH + "/src/main/resources/movies.csv"; + + private final Map users; + private final BiMap movies; + + private Long totalReviews; + private static final Integer RECORDS = 1000000; + private static final Double THRESHOLD = 0.1; + private static final Integer RECOMMENDATIONS_NUM = 3; + + public MovieRecommender(String filePath) { + users = new HashMap<>(); + movies = HashBiMap.create(); + totalReviews = 0L; + init(filePath); + } + + public void init(String filePath){ + try { + parseToCSV(readGZIPFile(filePath)); + } catch (IOException e) { + log.error(e.getMessage()); + } + } + + public BufferedReader readGZIPFile(String filePath) throws IOException { + var fileInputStream = new FileInputStream(filePath); + var gzipInputStream = new GZIPInputStream(fileInputStream); + var inputStreamReader = new InputStreamReader(gzipInputStream); + return new BufferedReader(inputStreamReader); + } + + public void parseToCSV(BufferedReader bufferedReader) throws IOException { + + log.info("Parsing file to CSV"); + + var fileWriter = new FileWriter(CSV_PATH); + + String userID = null, movieID = null, score = null; + + String currentLine; + while ((currentLine = bufferedReader.readLine()) != null) { + if (currentLine.contains(":")) { + + var field = currentLine.split(":", 2); + var fieldKey = field[0]; + var fieldValue = field[1].trim(); + + switch (fieldKey) { + case "product/productId" -> { + movieID = fieldValue; + var movieNum = (long) (movies.size() + 1); + movies.putIfAbsent(movieID, movieNum); + } + case "review/userId" -> { + userID = fieldValue; + var userNum = (long) (users.size() + 1); + users.putIfAbsent(userID, userNum); + } + case "review/score" -> { + score = fieldValue; + totalReviews++; + } + } + + if (Objects.nonNull(userID) && Objects.nonNull(movieID) && Objects.nonNull(score)) { + fileWriter.write(String.format("%s,%s,%s\n", users.get(userID), movies.get(movieID), score)); + userID = movieID = score = null; + if (totalReviews % RECORDS == 0){ + log.info(String.format("Parsed %d lines", totalReviews)); + } + } + + } + } + fileWriter.close(); + + log.info(String.format("File parsed. Total lines %d", totalReviews)); + } + + public long getTotalReviews() { + return totalReviews; + } + + public long getTotalProducts() { + return movies.size(); + } + + public long getTotalUsers() { + return users.size(); + } + + public List getRecommendationsForUser(String userID) throws IOException, TasteException { + var file = new File(CSV_PATH); + var dataModel = new FileDataModel(file); + var userSimilarity = new PearsonCorrelationSimilarity(dataModel); + var neighborhood = new ThresholdUserNeighborhood(THRESHOLD, userSimilarity, dataModel); + var recommender = new GenericUserBasedRecommender(dataModel, neighborhood, userSimilarity); + return recommender.recommend(users.get(userID), RECOMMENDATIONS_NUM) + .stream() + .map(movie -> movies.inverse().get(movie.getItemID())) + .collect(Collectors.toList()); + } + +} diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java index 0d0b1fe..2cdbb48 100644 --- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java @@ -6,6 +6,7 @@ import java.io.IOException; import java.util.List; +import static nearsoft.academy.bigdata.recommendation.MovieRecommender.ROOT_PATH; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThat; import static org.junit.matchers.JUnitMatchers.hasItem; @@ -15,7 +16,7 @@ public class MovieRecommenderTest { public void testDataInfo() throws IOException, TasteException { //download movies.txt.gz from // http://snap.stanford.edu/data/web-Movies.html - MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz"); + MovieRecommender recommender = new MovieRecommender(ROOT_PATH + "/src/main/resources/movies.txt.gz"); assertEquals(7911684, recommender.getTotalReviews()); assertEquals(253059, recommender.getTotalProducts()); assertEquals(889176, recommender.getTotalUsers());