ElasticSearch Explained (Part 5)

Published on 2021-10-25


JD Search in Practice

The project source code will be placed in my cloud drive, in the es-jingdong archive under the 学习-ElasticSearch directory.

Adding Dependencies

Create a new Spring Boot project and add the Spring Boot DevTools, Lombok, Spring Configuration Processor, Spring Web, Thymeleaf, and Spring Data Elasticsearch dependencies. Finally, manually add the fastjson dependency:

<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.78</version>
</dependency>
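
The test URLs later in this post use port 9090, so the project presumably overrides the default server port; Thymeleaf caching is also normally switched off during development. A minimal application.properties sketch, assuming that setup (it is not shown in the original project):

server.port=9090
spring.thymeleaf.cache=false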

Page Navigation Controller

package ml.guest997.controller;

import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;

@Controller
public class IndexController {
    @RequestMapping({"/", "/index"})
    public String index() {
        return "index";
    }
}

ES Configuration Class

package ml.guest997.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class ElasticSearchClientConfig {
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        return new RestHighLevelClient(RestClient.builder(new HttpHost("localhost", 9200, "http")));
    }
}
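
The client address is hard-coded above, which is fine for a demo. If you prefer to keep it configurable, here is a sketch that reads it from application.properties; the property names es.host and es.port are my own choice, not from the original project, and default to the same localhost:9200 used above.

package ml.guest997.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class ElasticSearchClientConfig {
    //Hypothetical properties; fall back to localhost:9200 when they are not set.
    @Value("${es.host:localhost}")
    private String host;
    @Value("${es.port:9200}")
    private int port;

    @Bean
    public RestHighLevelClient restHighLevelClient() {
        return new RestHighLevelClient(RestClient.builder(new HttpHost(host, port, "http")));
    }
}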

Crawling the Data

Adding the Dependency

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>

POJO Layer

package ml.guest997.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@AllArgsConstructor
@NoArgsConstructor
public class Content {
    private String img;
    private String price;
    private String title;
}

Writing the Crawler Utility Class

package ml.guest997.utils;

import ml.guest997.pojo.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

@Component
public class HtmlParseUtil {
    public List<Content> parseJD(String keyword) throws Exception {
        String url = "https://search.jd.com/Search?keyword=" + keyword + "&enc=utf-8";
        //Parse the page. The Document here corresponds to the browser document object you know from JS.
        Document document = Jsoup.parse(new URL(url), 30000);
        //The code below pulls the data out of the tags on the search results page, wraps each item in a Content object, and collects them all into a List.
        Element element = document.getElementById("J_goodsList");
        Elements elements = element.getElementsByTag("li");
        List<Content> contents = new ArrayList<>();
        for (Element el : elements) {
            String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");    //get the value of a tag attribute
            String price = el.getElementsByClass("p-price").eq(0).text();       //get the text of a tag
            String title = el.getElementsByClass("p-name").eq(0).text();
            contents.add(new Content(img, price, title));
        }
        return contents;
    }

    //Quick test to check that the data can be crawled.
    public static void main(String[] args) throws Exception {
        new HtmlParseUtil().parseJD("java").forEach(System.out::println);
    }

}
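
Note that Jsoup.parse(URL, timeout) sends a fairly bare request. If JD serves an empty or different page to clients without browser headers (an assumption on my part, not something stated in the original post), fetching with an explicit User-Agent may help. A sketch of a drop-in replacement for the Jsoup.parse line in parseJD:

        //Alternative fetch with a browser-like User-Agent header.
        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                .timeout(30000)
                .get();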

Service Layer

package ml.guest997.service;

import com.alibaba.fastjson.JSON;
import ml.guest997.pojo.Content;
import ml.guest997.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.List;

@Service
public class ContentService {

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    public Boolean parseContent(String keyword) throws Exception {
        //Remember to create the index in Kibana first: PUT /jd_goods
        List<Content> contents = new HtmlParseUtil().parseJD(keyword);
        //Bulk-index the crawled records into ES.
        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2m");
        for (Content content : contents) {
            bulkRequest.add(new IndexRequest("jd_goods").source(JSON.toJSONString(content), XContentType.JSON));
        }
        BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulk.hasFailures();
    }

}
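
As the comment in parseContent notes, the jd_goods index should exist before the bulk request runs. In the Kibana Dev Tools console:

PUT /jd_goods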

Controller Layer

package ml.guest997.controller;

import ml.guest997.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;

@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    @GetMapping("/parse/{keyword}")
    public Boolean parse(@PathVariable("keyword") String keyword) throws Exception {
        return contentService.parseContent(keyword);
    }

}

Testing

Visit localhost:9090/parse/java and the page should display true. Then check in Kibana that the documents were indexed successfully. (GET /jd_goods/_search)

Implementing Pagination

This assumes that ES already contains enough data.

Service Layer

public List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize) throws IOException {
    if (pageNo <= 1) {
        pageNo = 1;
    }
    //conditional search
    SearchRequest searchRequest = new SearchRequest("jd_goods");
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
    //pagination: from is a document offset, so convert the page number
    sourceBuilder.from((pageNo - 1) * pageSize);
    sourceBuilder.size(pageSize);
    //exact match on the title field
    TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
    sourceBuilder.query(termQueryBuilder);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
    //execute the search
    searchRequest.source(sourceBuilder);
    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
    //parse the results
    List<Map<String, Object>> list = new ArrayList<>();
    for (SearchHit documentFields : searchResponse.getHits()) {
        list.add(documentFields.getSourceAsMap());
    }
    return list;
}
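
This method is added to the existing ContentService. Besides the imports already shown there, it needs roughly the following (TimeValue moved packages in later 7.x client versions, so adjust to the version you use):

import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.concurrent.TimeUnit;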

Controller Layer

@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> search(@PathVariable("keyword") String keyword, @PathVariable("pageNo") int pageNo, @PathVariable("pageSize") int pageSize) throws IOException {
    return contentService.searchPage(keyword, pageNo, pageSize);
}

Testing

Visit localhost:9090/search/java/1/20. The page should return a list of results, which means pagination works.

Implementing Search Highlighting

Service Layer

public List<Map<String, Object>> searchPageHighlight(String keyword, int pageNo, int pageSize) throws IOException {
    if (pageNo <= 1) {
        pageNo = 1;
    }
    //conditional search
    SearchRequest searchRequest = new SearchRequest("jd_goods");
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
    //pagination: from is a document offset, so convert the page number
    sourceBuilder.from((pageNo - 1) * pageSize);
    sourceBuilder.size(pageSize);
    //exact match on the title field
    TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
    sourceBuilder.query(termQueryBuilder);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
    //highlighting
    HighlightBuilder highlightBuilder = new HighlightBuilder();
    highlightBuilder.field("title");
    highlightBuilder.requireFieldMatch(false);      //false lets fields be highlighted even if the query did not match that field directly
    highlightBuilder.preTags("<span style='color:red'>");
    highlightBuilder.postTags("</span>");
    sourceBuilder.highlighter(highlightBuilder);
    //execute the search
    searchRequest.source(sourceBuilder);
    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
    //parse the results
    ArrayList<Map<String, Object>> list = new ArrayList<>();
    for (SearchHit hit : searchResponse.getHits().getHits()) {
        Map<String, HighlightField> highlightFields = hit.getHighlightFields();
        HighlightField title = highlightFields.get("title");
        Map<String, Object> sourceAsMap = hit.getSourceAsMap();     //the original _source
        //Parse the highlighted field and replace the original title with the highlighted version.
        if (title != null) {
            Text[] fragments = title.fragments();
            String n_title = "";
            for (Text text : fragments) {
                n_title += text;
            }
            sourceAsMap.put("title", n_title);
        }
        list.add(sourceAsMap);      //hits without a highlight keep their original title
    }
    return list;
}
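
On top of the imports listed for searchPage, the highlighting version also needs:

import org.elasticsearch.common.text.Text;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;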

Controller Layer

Simply change the pagination endpoint to call the new service method:

@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> search(@PathVariable("keyword") String keyword, @PathVariable("pageNo") int pageNo, @PathVariable("pageSize") int pageSize) throws IOException {
    return contentService.searchPageHighlight(keyword, pageNo, pageSize);
}

Testing

Visit localhost:9090/search/java/1/20. The page should return many results, and every occurrence of "java" should be wrapped in the span tag, which means highlighting works.

Front-End and Back-End Separation

The front end uses Vue and axios; see the project source code for the details.